49 lines
1.8 KiB
Python
49 lines
1.8 KiB
Python
from __future__ import annotations
|
|
from typing import Dict
|
|
import polars as pl
|
|
from tools.base import BaseTool
|
|
|
|
|
|
class TextToColumnsTool(BaseTool):
    """Split a delimited text column into several columns or several rows.

    Configuration is read through ``self._cfg`` / ``self._cfg_attr`` (not
    defined in this class; presumably inherited from ``BaseTool``):

    * ``Field`` -- name of the column to split.
    * ``Delimiter`` -- separator string, defaults to ``","``.
    * ``NumCols`` (attribute ``value``) -- number of output columns in
      column mode, defaults to ``2``.
    * ``RootName`` -- prefix of the generated column names, defaults to
      ``"<Field>_"``.
    * ``SplitToRows`` (attribute ``value``) -- ``"true"`` (case-insensitive)
      selects row mode; anything else selects column mode.
    """

    def execute(self, inputs: Dict[str, pl.DataFrame]) -> Dict[str, pl.DataFrame]:
        """Split the configured field of the ``"Input"`` frame.

        Row mode emits one output row per token; the token (whitespace
        stripped) replaces the field value and all other values are copied.
        Column mode keeps the row count and adds ``<RootName>1`` ..
        ``<RootName>N`` columns produced by ``str.splitn``; tokens are not
        stripped in this mode.

        Args:
            inputs: Mapping of anchor name to frame; only ``"Input"`` is read.

        Returns:
            A dict with a single ``"Output"`` frame. The input frame is
            returned unchanged when there is no config, the input is empty,
            the field is unset or missing from the frame, or (column mode)
            ``NumCols`` is smaller than 1.

        Raises:
            ValueError: If the ``NumCols`` value cannot be parsed by ``int``.
        """
        df = inputs.get("Input", pl.DataFrame())
        if self.config is None or df.is_empty():
            return {"Output": df}

        field = self._cfg("Field", "") or ""
        delimiter = self._cfg("Delimiter", ",") or ","
        num_cols = int(self._cfg_attr("NumCols", "value", "2") or "2")
        root_name = self._cfg("RootName", f"{field}_") or f"{field}_"
        split_flag = self._cfg_attr("SplitToRows", "value", "False") or "False"
        split_to_rows = split_flag.lower() == "true"

        if not field or field not in df.columns:
            return {"Output": df}

        if split_to_rows:
            rows_out: list[dict] = []
            for row in df.to_dicts():
                raw = row.get(field)
                # Only a missing value becomes ""; falsy-but-real values such
                # as 0 or False must keep their text ("0", "False").
                text = "" if raw is None else str(raw)
                # `field` is already a key of `row`, so key order is kept.
                rows_out.extend(
                    {**row, field: token.strip()}
                    for token in text.split(delimiter)
                )
            return {"Output": pl.DataFrame(rows_out) if rows_out else df}

        # Column mode.
        if num_cols < 1:
            # Nothing to produce; avoid handing a non-positive count to splitn.
            return {"Output": df}

        # splitn yields a struct whose fields polars names field_0, field_1, ...
        parts = (
            df[field]
            .cast(pl.String)
            .str.splitn(delimiter, num_cols)
            .struct.unnest()
        )

        new_columns = []
        for i in range(num_cols):
            part_name = f"field_{i}"
            out_name = f"{root_name}{i + 1}"
            if part_name in parts.columns:
                new_columns.append(parts[part_name].alias(out_name))
            else:
                # Defensive fallback: keep the output schema stable even if a
                # struct field is absent.
                new_columns.append(pl.lit(None).cast(pl.String).alias(out_name))

        # Output names are distinct, so one batched call is equivalent to
        # adding the columns one by one.
        return {"Output": df.with_columns(new_columns)}
|