Skip to content

Post-Processing

The postprocess module provides utilities for cleaning up delineated field boundary polygons, including polygonization, simplification, regularization, area filtering, cross-tile merging, and LULC-based crop filtering.

The lulc_filter submodule automatically removes non-agricultural polygons using NLCD (US), Dynamic World (global, ≥2016), or C3S Land Cover (global, pre-Sentinel). This is integrated into the main pipeline and enabled by default.

postprocess

Post-processing utilities for field boundary polygons.

Provides shared functionality for polygonization, simplification, regularization, area filtering, and cross-tile merging.

filter_polygons

filter_polygons(gdf: GeoDataFrame, min_area_m2: float = 2500.0, max_area_m2: float | None = None, remove_holes_below_m2: float | None = None, lulc_mask_path: str | None = None, lulc_agricultural_classes: list[int] | None = None) -> gpd.GeoDataFrame

Filter field boundary polygons by area and optional LULC mask.

Parameters:

Name Type Description Default
gdf GeoDataFrame

Input polygons.

required
min_area_m2 float

Minimum polygon area in m** (default 2500).

2500.0
max_area_m2 float or None

Maximum polygon area in m. None means no upper limit.

None
remove_holes_below_m2 float or None

Remove interior rings (holes) smaller than this threshold.

None
lulc_mask_path str or None

Path to a LULC raster for filtering non-agricultural areas.

None
lulc_agricultural_classes list[int] or None

LULC class values considered agricultural.

None

Returns:

Type Description
GeoDataFrame

Filtered polygons.

Source code in agribound/postprocess/filter.py
def filter_polygons(
    gdf: gpd.GeoDataFrame,
    min_area_m2: float = 2500.0,
    max_area_m2: float | None = None,
    remove_holes_below_m2: float | None = None,
    lulc_mask_path: str | None = None,
    lulc_agricultural_classes: list[int] | None = None,
) -> gpd.GeoDataFrame:
    """Filter field boundary polygons by area and optional LULC mask.

    Parameters
    ----------
    gdf : geopandas.GeoDataFrame
        Input polygons.
    min_area_m2 : float
        Minimum polygon area in m** (default 2500).
    max_area_m2 : float or None
        Maximum polygon area in m**. *None* means no upper limit.
    remove_holes_below_m2 : float or None
        Remove interior rings (holes) smaller than this threshold.
    lulc_mask_path : str or None
        Path to a LULC raster for filtering non-agricultural areas.
    lulc_agricultural_classes : list[int] or None
        LULC class values considered agricultural.

    Returns
    -------
    geopandas.GeoDataFrame
        Filtered polygons.
    """
    if len(gdf) == 0:
        return gdf

    result = gdf.copy()

    # Compute area in equal-area CRS
    ea_crs = get_equal_area_crs()
    result_ea = result.to_crs(ea_crs)
    result["area_m2"] = result_ea.geometry.area
    result["perimeter_m"] = result_ea.geometry.length

    initial_count = len(result)

    # Area filtering
    if min_area_m2 > 0:
        result = result[result["area_m2"] >= min_area_m2]

    if max_area_m2 is not None:
        result = result[result["area_m2"] <= max_area_m2]

    # Remove small holes
    if remove_holes_below_m2 is not None:
        result["geometry"] = result.geometry.apply(
            lambda g: _remove_small_holes(g, remove_holes_below_m2, ea_crs, result.crs)
        )

    # LULC filtering
    if lulc_mask_path is not None and lulc_agricultural_classes is not None:
        result = _filter_by_lulc(result, lulc_mask_path, lulc_agricultural_classes)

    removed = initial_count - len(result)
    if removed > 0:
        logger.info(
            "Filtered %d polygons (min=%.0f m2, max=%s m2): %d remaining",
            removed,
            min_area_m2,
            max_area_m2 or "inf",
            len(result),
        )

    return result.reset_index(drop=True)

merge_polygons

merge_polygons(gdf: GeoDataFrame, iou_threshold: float = 0.3, containment_threshold: float = 0.8) -> gpd.GeoDataFrame

Merge overlapping or adjacent polygons.

Uses R-tree spatial indexing for efficient overlap detection.

Parameters:

Name Type Description Default
gdf GeoDataFrame

Input polygons (potentially with cross-tile duplicates).

required
iou_threshold float

IoU threshold above which polygons are merged (default 0.3).

0.3
containment_threshold float

If one polygon contains this fraction of another, merge them (default 0.8).

0.8

Returns:

Type Description
GeoDataFrame

Merged polygons with no duplicates.

Source code in agribound/postprocess/merge.py
def merge_polygons(
    gdf: gpd.GeoDataFrame,
    iou_threshold: float = 0.3,
    containment_threshold: float = 0.8,
) -> gpd.GeoDataFrame:
    """Merge overlapping or adjacent polygons.

    Uses R-tree spatial indexing for efficient overlap detection.

    Parameters
    ----------
    gdf : geopandas.GeoDataFrame
        Input polygons (potentially with cross-tile duplicates).
    iou_threshold : float
        IoU threshold above which polygons are merged (default 0.3).
    containment_threshold : float
        If one polygon contains this fraction of another, merge them
        (default 0.8).

    Returns
    -------
    geopandas.GeoDataFrame
        Merged polygons with no duplicates.
    """
    if len(gdf) <= 1:
        return gdf

    # Build spatial index
    sindex = gdf.sindex

    # Union-Find for tracking merge groups
    parent = list(range(len(gdf)))

    def find(i):
        while parent[i] != i:
            parent[i] = parent[parent[i]]
            i = parent[i]
        return i

    def union(i, j):
        ri, rj = find(i), find(j)
        if ri != rj:
            parent[ri] = rj

    # Find merge candidates
    for i, row in gdf.iterrows():
        geom_i = row.geometry
        if geom_i is None or geom_i.is_empty:
            continue

        candidates = list(sindex.intersection(geom_i.bounds))
        for j in candidates:
            if j <= i:
                continue
            if find(i) == find(j):
                continue

            geom_j = gdf.iloc[j].geometry
            if geom_j is None or geom_j.is_empty:
                continue

            if not geom_i.intersects(geom_j):
                continue

            try:
                intersection = geom_i.intersection(geom_j)
                intersection_area = intersection.area

                # IoU check
                union_area = geom_i.area + geom_j.area - intersection_area
                if union_area > 0:
                    iou = intersection_area / union_area
                    if iou >= iou_threshold:
                        union(i, j)
                        continue

                # Containment check
                min_area = min(geom_i.area, geom_j.area)
                if min_area > 0:
                    containment = intersection_area / min_area
                    if containment >= containment_threshold:
                        union(i, j)
            except Exception:
                continue

    # Group and merge
    groups: dict[int, list[int]] = {}
    for i in range(len(gdf)):
        root = find(i)
        groups.setdefault(root, []).append(i)

    merged_geoms = []
    for _root, members in groups.items():
        if len(members) == 1:
            merged_geoms.append(gdf.iloc[members[0]].geometry)
        else:
            geoms = [
                g.buffer(0) if not g.is_valid else g
                for m in members
                if (g := gdf.iloc[m].geometry) is not None
            ]
            if geoms:
                merged = unary_union(geoms)
                # Explode MultiPolygons
                if isinstance(merged, MultiPolygon):
                    merged_geoms.extend(merged.geoms)
                else:
                    merged_geoms.append(merged)

    result = gpd.GeoDataFrame(geometry=merged_geoms, crs=gdf.crs)
    n_merged = len(gdf) - len(result)
    if n_merged > 0:
        logger.info("Merged %d overlapping polygons → %d", len(gdf), len(result))

    return result.reset_index(drop=True)

polygonize_mask

polygonize_mask(mask_path: str, min_area_m2: float = 2500.0, band: int = 1, connectivity: int = 4, field_value: int | None = None) -> gpd.GeoDataFrame

Convert a raster segmentation mask to field boundary polygons.

Parameters:

Name Type Description Default
mask_path str

Path to the segmentation mask GeoTIFF. Non-zero values are treated as field pixels (or only field_value if specified).

required
min_area_m2 float

Minimum polygon area in m**2 to keep (default 2500).

2500.0
band int

Band index to polygonize (1-based, default 1).

1
connectivity int

Pixel connectivity for grouping (4 or 8, default 4).

4
field_value int or None

If set, only pixels equal to this value are polygonized. If None, all non-zero pixels are included.

None

Returns:

Type Description
GeoDataFrame

Field boundary polygons with geometry and class_value columns.

Source code in agribound/postprocess/polygonize.py
def polygonize_mask(
    mask_path: str,
    min_area_m2: float = 2500.0,
    band: int = 1,
    connectivity: int = 4,
    field_value: int | None = None,
) -> gpd.GeoDataFrame:
    """Convert a raster segmentation mask to field boundary polygons.

    Parameters
    ----------
    mask_path : str
        Path to the segmentation mask GeoTIFF. Non-zero values are
        treated as field pixels (or only *field_value* if specified).
    min_area_m2 : float
        Minimum polygon area in m**2 to keep (default 2500).
    band : int
        Band index to polygonize (1-based, default 1).
    connectivity : int
        Pixel connectivity for grouping (4 or 8, default 4).
    field_value : int or None
        If set, only pixels equal to this value are polygonized.
        If *None*, all non-zero pixels are included.

    Returns
    -------
    geopandas.GeoDataFrame
        Field boundary polygons with ``geometry`` and ``class_value`` columns.
    """
    with warnings.catch_warnings():
        # Some engines write invalid nodata (e.g. -inf for uint8); ignore.
        warnings.filterwarnings("ignore", message=".*nodata.*")
        with rasterio.open(mask_path) as src:
            mask = src.read(band)
            transform = src.transform
            crs = src.crs

    # Replace inf/nan with 0
    mask = np.where(np.isfinite(mask), mask, 0)

    # Ensure integer type
    if mask.dtype in (np.float32, np.float64):
        mask = mask.astype(np.int32)

    polygons = []
    values = []

    for geom, val in rio_shapes(mask, connectivity=connectivity, transform=transform):
        if val == 0:
            continue
        if field_value is not None and int(val) != field_value:
            continue
        poly = shapely_shape(geom)
        if poly.is_valid and not poly.is_empty:
            polygons.append(poly)
            values.append(int(val))

    if not polygons:
        logger.warning("No polygons extracted from mask %s", mask_path)
        return gpd.GeoDataFrame(columns=["geometry", "class_value"], crs=crs)

    gdf = gpd.GeoDataFrame({"class_value": values, "geometry": polygons}, crs=crs)

    # Area filtering
    if min_area_m2 > 0:
        from agribound.postprocess.filter import filter_polygons

        gdf = filter_polygons(gdf, min_area_m2=min_area_m2)

    logger.info("Polygonized %d features from %s", len(gdf), mask_path)
    return gdf

regularize_polygons

regularize_polygons(gdf: GeoDataFrame, method: str = 'adaptive', angle_threshold: float = 15.0) -> gpd.GeoDataFrame

Regularize field boundary polygon geometry.

Parameters:

Name Type Description Default
gdf GeoDataFrame

Input polygons.

required
method str

Regularization method: "orthogonal", "adaptive" (default), or "none".

'adaptive'
angle_threshold float

Maximum deviation angle for orthogonalization (degrees).

15.0

Returns:

Type Description
GeoDataFrame

Regularized polygons.

Source code in agribound/postprocess/regularize.py
def regularize_polygons(
    gdf: gpd.GeoDataFrame,
    method: str = "adaptive",
    angle_threshold: float = 15.0,
) -> gpd.GeoDataFrame:
    """Regularize field boundary polygon geometry.

    Parameters
    ----------
    gdf : geopandas.GeoDataFrame
        Input polygons.
    method : str
        Regularization method: ``"orthogonal"``, ``"adaptive"``
        (default), or ``"none"``.
    angle_threshold : float
        Maximum deviation angle for orthogonalization (degrees).

    Returns
    -------
    geopandas.GeoDataFrame
        Regularized polygons.
    """
    if len(gdf) == 0 or method == "none":
        return gdf

    try:
        if method == "orthogonal":
            from geoai.utils.geometry import orthogonalize

            result = gdf.copy()
            result["geometry"] = result.geometry.apply(
                lambda g: orthogonalize(g, angle_threshold=angle_threshold) if g.is_valid else g
            )
        elif method == "adaptive":
            from geoai.utils.geometry import adaptive_regularization

            result = gdf.copy()
            result["geometry"] = result.geometry.apply(
                lambda g: adaptive_regularization(g) if g.is_valid else g
            )
        else:
            logger.warning("Unknown regularization method %r, skipping", method)
            return gdf

        # Validate results
        result = result[result.geometry.is_valid & ~result.geometry.is_empty]
        logger.info("Regularized %d polygons (method=%s)", len(result), method)
        return result.reset_index(drop=True)

    except ImportError:
        logger.info(
            "geoai not installed — skipping regularization. "
            "Install with: pip install agribound[geoai]"
        )
        return gdf
    except Exception as exc:
        logger.warning("Regularization failed: %s — returning original polygons", exc)
        return gdf

simplify_polygons

simplify_polygons(gdf: GeoDataFrame, tolerance: float = 2.0, preserve_topology: bool = True) -> gpd.GeoDataFrame

Simplify field boundary polygons.

The tolerance is always in meters. If the input CRS is geographic (e.g. EPSG:4326), the data is temporarily projected to a local UTM zone for simplification.

Parameters:

Name Type Description Default
gdf GeoDataFrame

Input polygons.

required
tolerance float

Simplification tolerance in meters (default 2.0).

2.0
preserve_topology bool

If True, prevent polygon collapse and self-intersection.

True

Returns:

Type Description
GeoDataFrame

Simplified polygons.

Source code in agribound/postprocess/simplify.py
def simplify_polygons(
    gdf: gpd.GeoDataFrame,
    tolerance: float = 2.0,
    preserve_topology: bool = True,
) -> gpd.GeoDataFrame:
    """Simplify field boundary polygons.

    The tolerance is always in **meters**. If the input CRS is
    geographic (e.g. EPSG:4326), the data is temporarily projected to
    a local UTM zone for simplification.

    Parameters
    ----------
    gdf : geopandas.GeoDataFrame
        Input polygons.
    tolerance : float
        Simplification tolerance in meters (default 2.0).
    preserve_topology : bool
        If *True*, prevent polygon collapse and self-intersection.

    Returns
    -------
    geopandas.GeoDataFrame
        Simplified polygons.
    """
    if len(gdf) == 0 or tolerance <= 0:
        return gdf

    result, original_crs = _to_metric(gdf)
    result = result.copy()
    result["geometry"] = result.geometry.simplify(tolerance, preserve_topology=preserve_topology)

    # Remove any geometries that became empty after simplification
    result = result[~result.geometry.is_empty]
    result = result[result.geometry.is_valid]

    n_removed = len(gdf) - len(result)
    if n_removed > 0:
        logger.info("Simplified polygons: %d removed due to collapse", n_removed)

    result = _to_original(result, original_crs)
    return result.reset_index(drop=True)