Pyteryx/alteryx_runner/tools/parse/text_to_columns.py

49 lines
1.8 KiB
Python

from __future__ import annotations
from typing import Dict
import polars as pl
from tools.base import BaseTool
class TextToColumnsTool(BaseTool):
    """Split one delimited text column into extra columns or extra rows.

    Settings are read through ``self._cfg`` and ``self._cfg_attr``, which are
    not defined in this class (presumably provided by ``BaseTool``):

    * ``Field`` -- name of the column to split.
    * ``Delimiter`` -- separator text; falls back to ``","``.
    * ``NumCols`` (``value`` attribute) -- number of output columns in column
      mode; falls back to ``2``.
    * ``RootName`` -- prefix for the new column names; falls back to
      ``"<Field>_"``.
    * ``SplitToRows`` (``value`` attribute) -- ``"true"`` (case-insensitive)
      selects row mode; anything else selects column mode.
    """

    def execute(self, inputs: Dict[str, pl.DataFrame]) -> Dict[str, pl.DataFrame]:
        """Run the split on ``inputs["Input"]`` and return it as ``"Output"``.

        The input frame is returned unchanged when there is no configuration,
        the frame is empty, the configured field is blank or not a column of
        the frame, or (column mode) fewer than one output column is requested.

        Row mode emits one row per token, with the token (whitespace-stripped)
        replacing the original field value and all other columns repeated.
        Column mode keeps every row and appends ``<RootName>1`` ..
        ``<RootName>N`` holding the pieces produced by ``str.splitn``.

        Args:
            inputs: Mapping of anchor name to frame; only ``"Input"`` is read.

        Returns:
            A single-entry dict ``{"Output": frame}``.

        Raises:
            ValueError: In column mode, if ``NumCols`` is not an integer string.
        """
        df = inputs.get("Input", pl.DataFrame())
        if self.config is None or df.is_empty():
            return {"Output": df}

        field = self._cfg("Field", "") or ""
        if not field or field not in df.columns:
            return {"Output": df}

        delimiter = self._cfg("Delimiter", ",") or ","
        split_to_rows = (
            self._cfg_attr("SplitToRows", "value", "False") or "False"
        ).lower() == "true"

        if split_to_rows:
            rows_out: list[dict] = []
            for row in df.to_dicts():
                raw = row.get(field)
                # Only a missing value maps to "". Falsy-but-real values such
                # as 0 or False must keep their text form ("0", "False").
                text = "" if raw is None else str(raw)
                # NOTE(review): tokens are stripped here but not in column
                # mode; kept as-is because the intended behavior is unclear.
                rows_out.extend(
                    {**row, field: token.strip()}
                    for token in text.split(delimiter)
                )
            if not rows_out:
                return {"Output": df}
            # Infer dtypes from every row rather than only the first 100, so a
            # column that is null in the leading rows is still typed correctly.
            return {"Output": pl.DataFrame(rows_out, infer_schema_length=None)}

        # Column mode: NumCols and RootName are only needed from here on.
        num_cols = int(self._cfg_attr("NumCols", "value", "2") or "2")
        if num_cols < 1:
            # No columns requested, so there is nothing to append.
            return {"Output": df}
        root_name = self._cfg("RootName", f"{field}_") or f"{field}_"

        split_series = df[field].cast(pl.String).str.splitn(delimiter, num_cols)
        struct_df = split_series.struct.unnest()

        # Polars names the struct fields field_0, field_1, ...; rename them to
        # the 1-based <RootName><i> scheme and append them all in one call.
        new_columns = []
        for i in range(num_cols):
            pname = f"field_{i}"
            out_name = f"{root_name}{i + 1}"
            if pname in struct_df.columns:
                new_columns.append(struct_df[pname].alias(out_name))
            else:
                # Defensive: keep the output shape stable if a piece is absent.
                new_columns.append(pl.lit(None).cast(pl.String).alias(out_name))
        return {"Output": df.with_columns(new_columns)}