# Pyteryx/alteryx_runner/tools/preparation/unique_tool.py
#
# Original listing metadata: 40 lines, 1.2 KiB, Python

from __future__ import annotations
from typing import Dict
import polars as pl
from tools.base import BaseTool
class UniqueTool(BaseTool):
    """Split the "Input" frame into first-occurrence rows and repeated rows.

    Rows are compared on the fields listed under ``UniqueFields/Field`` in
    ``self.config``. The first row seen for each key combination goes to the
    "Unique" output; every later row with the same key goes to "Duplicates".
    """

    def execute(self, inputs: Dict[str, pl.DataFrame]) -> Dict[str, pl.DataFrame]:
        """Partition ``inputs["Input"]`` into unique and duplicate rows.

        Args:
            inputs: Mapping of input names to frames. Only ``"Input"`` is
                read; a missing entry is treated as an empty frame.

        Returns:
            A dict with exactly two keys, ``"Unique"`` and ``"Duplicates"``.
            When ``self.config`` is ``None`` or the input is empty, the input
            is passed through as ``"Unique"`` and ``"Duplicates"`` is an
            empty frame. Otherwise both frames are sorted by the key fields.
        """
        df = inputs.get("Input", pl.DataFrame())
        if self.config is None or df.is_empty():
            # Same output names as the normal path below, so a consumer that
            # looks up "Duplicates" finds it on this path as well.
            return {"Unique": df, "Duplicates": pl.DataFrame()}

        # Keep only configured fields that exist in the frame.
        # NOTE(review): self.config is presumably an xml.etree Element
        # (it is used via .findall / .attrib) -- confirm against BaseTool.
        key_fields = [
            f.attrib["field"]
            for f in self.config.findall("UniqueFields/Field")
            if f.attrib.get("field", "") in df.columns
        ]
        if not key_fields:
            # No usable key configured: compare on every column. This is read
            # before the helper index column is added, so it is not a key.
            key_fields = df.columns

        indexed = df.with_row_index("__row_idx__")

        # Row index of the first occurrence of each distinct key combination.
        first_idx_list = (
            indexed.group_by(key_fields, maintain_order=True)
            .agg(pl.col("__row_idx__").first())
            .get_column("__row_idx__")
            .to_list()
        )
        is_first = pl.col("__row_idx__").is_in(first_idx_list)

        # maintain_order=True keeps rows that share a key in their input
        # order; the default sort leaves the order of ties unspecified.
        unique_df = (
            indexed.filter(is_first)
            .drop("__row_idx__")
            .sort(key_fields, maintain_order=True)
        )
        dup_df = (
            indexed.filter(~is_first)
            .drop("__row_idx__")
            .sort(key_fields, maintain_order=True)
        )
        return {"Unique": unique_df, "Duplicates": dup_df}