# Pyteryx/alteryx_runner/tools/preparation/sort_tool.py (Python; listed as 70 lines, 2.2 KiB)

from __future__ import annotations
from typing import Dict
import polars as pl
from tools.base import BaseTool
class SortTool(BaseTool):
    """Sort the ``"Input"`` frame according to the tool's ``SortInfo`` config.

    ``self.config`` is accessed through ``find`` / ``findall`` / ``attrib``
    (presumably an ``xml.etree.ElementTree.Element`` -- confirm against
    ``BaseTool``). Each ``<Field>`` child of ``<SortInfo>`` names a column in
    its ``field`` attribute and a direction in its ``order`` attribute
    (``"Descending"`` sorts descending; any other value, or a missing
    attribute, sorts ascending).
    """

    def execute(self, inputs: Dict[str, pl.DataFrame]) -> Dict[str, pl.DataFrame]:
        """Return ``{"Output": sorted_frame}`` for the frame under ``"Input"``.

        The input frame is returned unchanged when there is no config, the
        frame is empty, ``<SortInfo>`` is missing, or none of the configured
        fields names an existing column.

        Args:
            inputs: Mapping of anchor name to frame; only ``"Input"`` is read.
                A missing ``"Input"`` is treated as an empty frame.

        Returns:
            A single-entry dict holding the (possibly sorted) frame under
            ``"Output"``.
        """
        df = inputs.get("Input", pl.DataFrame())
        if self.config is None or df.is_empty():
            return {"Output": df}

        sort_info = self.config.find("SortInfo")
        if sort_info is None:
            return {"Output": df}

        # Build both lists in one pass so they can never fall out of step.
        # Fields that name no existing column -- including <Field> elements
        # with no ``field`` attribute at all -- are skipped rather than raising.
        available = set(df.columns)
        by: list[str] = []
        descending: list[bool] = []
        for field in sort_info.findall("Field"):
            column = field.attrib.get("field")
            if column not in available:
                continue
            by.append(column)
            descending.append(field.attrib.get("order", "Ascending") == "Descending")

        if not by:
            return {"Output": df}

        if sort_info.attrib.get("locale", "0") == "1033":
            # Locale 1033 is routed through the natural-sort hook. NOTE: the
            # hook currently performs the same standard sort as the branch
            # below (see ``_natural_sort``).
            df = self._natural_sort(df, by, descending)
        else:
            df = df.sort(by=by, descending=descending, maintain_order=True)
        return {"Output": df}

    def _natural_sort(
        self,
        df: pl.DataFrame,
        by: list[str],
        descending: list[bool],
    ) -> pl.DataFrame:
        """Sort ``df`` for the locale-aware path.

        A true natural ordering (digit runs compared as numbers) is NOT
        implemented: this method has always fallen back to the standard
        Polars sort, which is treated as close enough for most cases. The
        previous version additionally materialised every row and built
        per-row regex keys that were then discarded; that dead work has been
        removed, so the result is unchanged but no extra copies are made.

        Args:
            df: Frame to sort.
            by: Column names to sort on, in priority order.
            descending: Per-column direction flags, parallel to ``by``.

        Returns:
            ``df`` sorted by ``by`` with ties kept in their original order.
        """
        # TODO: implement real natural/dictionary ordering once the expected
        # collation rules for this locale are confirmed.
        return df.sort(by=by, descending=descending, maintain_order=True)