40 lines
1.2 KiB
Python
40 lines
1.2 KiB
Python
from __future__ import annotations
|
|
from typing import Dict
|
|
import polars as pl
|
|
from tools.base import BaseTool
|
|
|
|
|
|
class UniqueTool(BaseTool):
    """Split the incoming frame into first occurrences and repeats of a key.

    The key columns are read from ``UniqueFields/Field`` elements of
    ``self.config`` (their ``field`` attribute); names that are not columns
    of the input are ignored.  When no usable key column is configured,
    every column of the input is used as the key.
    """

    def execute(self, inputs: Dict[str, pl.DataFrame]) -> Dict[str, pl.DataFrame]:
        """Partition ``inputs["Input"]`` into unique and duplicate rows.

        Args:
            inputs: Mapping of input anchor name to frame.  Only the
                ``"Input"`` entry is read; a missing entry is treated as an
                empty frame.

        Returns:
            A dict with two entries:

            * ``"Unique"`` - the first row seen for each distinct key.
            * ``"Duplicates"`` - every later row sharing an already-seen key.

            Both frames are sorted by the key columns.  If there is no
            configuration or the input is empty, the input is returned
            unchanged as ``"Unique"`` and ``"Duplicates"`` is a zero-row
            frame with the same schema.
        """
        df = inputs.get("Input", pl.DataFrame())

        if self.config is None or df.is_empty():
            # Same key as the main path ("Duplicates"), and keep the schema
            # so downstream consumers still see the expected columns.
            return {"Unique": df, "Duplicates": df.clear()}

        # NOTE(review): self.config is presumably an ElementTree element
        # (it is used via .findall / .attrib) - confirm against BaseTool.
        key_fields = [
            f.attrib["field"]
            for f in self.config.findall("UniqueFields/Field")
            if f.attrib.get("field", "") in df.columns
        ]
        if not key_fields:
            key_fields = df.columns

        # Temporary row-number column; pick a name that cannot clash with an
        # existing column, otherwise with_row_index would raise.
        idx_col = "__row_idx__"
        while idx_col in df.columns:
            idx_col = f"_{idx_col}"

        indexed = df.with_row_index(idx_col)

        # Row number of the first occurrence of each distinct key.
        first_idx_list = (
            indexed.group_by(key_fields, maintain_order=True)
            .agg(pl.col(idx_col).first())[idx_col]
            .to_list()
        )

        is_first = pl.col(idx_col).is_in(first_idx_list)
        unique_df = indexed.filter(is_first).drop(idx_col).sort(key_fields)
        dup_df = indexed.filter(~is_first).drop(idx_col).sort(key_fields)

        return {"Unique": unique_df, "Duplicates": dup_df}
|