# Pyteryx/alteryx_runner/tools/inout/input_data.py

from __future__ import annotations
from typing import Dict, Optional
import xml.etree.ElementTree as ET
import polars as pl
from tools.base import BaseTool


class InputDataTool(BaseTool):
    """Load the file named in the tool's XML configuration into a DataFrame.

    The configuration is expected to contain a ``File`` element whose text is
    the path (optionally followed by ``|||`` and a sheet name) and whose
    ``FileFormat`` / ``RecordLimit`` attributes select the reader and cap the
    number of rows.  An optional ``FormatSpecificOptions`` element carries
    per-format settings (delimiter, header row, first line to import).
    """

    # Delimiters that the configuration may spell as a two-character escape
    # sequence; ``pl.read_csv`` needs the real single character.
    # NOTE(review): ``\0`` ("no delimiter") is not handled here - confirm
    # whether it needs support.
    _DELIMITER_ESCAPES = {"\\t": "\t", "\\s": " "}

    def execute(self, inputs: Dict[str, pl.DataFrame]) -> Dict[str, pl.DataFrame]:
        """Read the configured file and return it under the ``"Output"`` key.

        Args:
            inputs: Upstream frames; not used by this tool.

        Returns:
            ``{"Output": df}``.  ``df`` is an empty DataFrame when there is no
            configuration or no ``File`` path.

        Raises:
            ValueError: If ``FileFormat`` or ``RecordLimit`` is not an integer.
        """
        if self.config is None:
            return {"Output": pl.DataFrame()}

        file_el = self.config.find("File")
        if file_el is None or not file_el.text:
            return {"Output": pl.DataFrame()}

        raw_path = file_el.text.strip()
        fmt = int(file_el.attrib.get("FileFormat", "0"))
        record_limit_str = file_el.attrib.get("RecordLimit", "").strip()
        limit = int(record_limit_str) if record_limit_str else None

        opts = self.config.find("FormatSpecificOptions")
        if opts is None:
            # Empty stand-in so the readers can call findtext() unconditionally.
            opts = ET.Element("x")

        path_str, sheet = self._parse_path(raw_path)
        resolved = self.ctx.resolve_path(path_str)

        df = self._read(str(resolved), fmt, sheet, opts)

        # Compare against None rather than truthiness so an explicit limit of
        # 0 yields zero rows instead of being ignored.
        # NOTE(review): assumes RecordLimit="0" means "no rows" - confirm.
        if limit is not None:
            df = df.head(limit)

        # Trim whitespace from string columns (matches Alteryx behavior).
        # Done after the row limit, in a single pass over all string columns.
        string_cols = [
            name for name, dtype in df.schema.items() if dtype == pl.String
        ]
        if string_cols:
            df = df.with_columns(pl.col(string_cols).str.strip_chars())

        return {"Output": df}

    def _parse_path(self, raw: str) -> tuple[str, Optional[str]]:
        """Split ``path|||sheet`` into ``(path, sheet)``.

        The sheet part has surrounding whitespace and backticks removed and
        any trailing ``$`` stripped.  Without a ``|||`` separator the sheet is
        ``None``.
        """
        if "|||" in raw:
            path, sheet = raw.split("|||", 1)
            return path.strip(), sheet.strip().strip("`").rstrip("$")
        return raw.strip(), None

    def _read(
        self,
        path: str,
        fmt: int,
        sheet: Optional[str],
        opts: ET.Element,
    ) -> pl.DataFrame:
        """Dispatch to the reader for the numeric ``FileFormat`` code.

        Args:
            path: Resolved file path.
            fmt: Format code: 0/6 delimited text, 25 Excel, 2 Parquet,
                19 YXDB, 56 JSON.  Any other code falls back to CSV.
            sheet: Excel sheet name, or ``None``/empty for the first sheet.
            opts: The ``FormatSpecificOptions`` element (may be empty).

        Raises:
            NotImplementedError: For YXDB when the ``yxdb`` package is missing.
        """
        if fmt in (0, 6):   # CSV / delimited
            return self._read_delimited(path, opts)
        if fmt == 25:       # Excel
            return self._read_excel(path, sheet, opts)
        if fmt == 2:        # Parquet
            return pl.read_parquet(path)
        if fmt == 19:       # YXDB
            return self._read_yxdb(path)
        if fmt == 56:       # JSON
            return pl.read_json(path)

        # Fallback: try CSV
        return pl.read_csv(path, infer_schema_length=10000, ignore_errors=True)

    @staticmethod
    def _skip_rows(opts: ET.Element) -> int:
        """Convert the 1-based ``ImportLine`` option to a count of rows to skip."""
        import_line = int(opts.findtext("ImportLine") or "1")
        return max(0, import_line - 1)

    def _read_delimited(self, path: str, opts: ET.Element) -> pl.DataFrame:
        """Read a delimited text file using the configured delimiter/header."""
        # Both spellings of the element name are accepted.
        delim = opts.findtext("Delimeter") or opts.findtext("Delimiter") or ","
        # Translate escape sequences such as the two characters "\t" into the
        # single character read_csv requires.
        delim = self._DELIMITER_ESCAPES.get(delim, delim)

        header_text = opts.findtext("HeaderRow") or "True"
        has_header = header_text.strip().lower() in ("true", "1", "yes")

        return pl.read_csv(
            path,
            separator=delim,
            has_header=has_header,
            skip_rows=self._skip_rows(opts),
            infer_schema_length=10000,
            ignore_errors=True,
        )

    def _read_excel(
        self,
        path: str,
        sheet: Optional[str],
        opts: ET.Element,
    ) -> pl.DataFrame:
        """Read one worksheet; the first sheet when no name is given."""
        # "FirstRowData" == true means the first row is data, i.e. no header.
        read_header = (opts.findtext("FirstRowData") or "False").lower() != "true"
        kwargs = {
            "read_options": {
                "has_header": read_header,
                "skip_rows": self._skip_rows(opts),
            },
        }
        # Only pass sheet_name when there is a real name: it must be a string,
        # and leaving both sheet_name and sheet_id unset selects the first
        # sheet (passing 0 would not).
        if sheet:
            kwargs["sheet_name"] = sheet
        return pl.read_excel(path, **kwargs)

    def _read_yxdb(self, path: str) -> pl.DataFrame:
        """Read a YXDB file via the optional ``yxdb`` package."""
        # Keep the try limited to the import so that errors raised while
        # reading the file are not mislabelled as a missing dependency.
        try:
            import yxdb
        except ImportError as err:
            raise NotImplementedError(
                "YXDB format requires the 'yxdb' package: pip install yxdb"
            ) from err

        # NOTE(review): confirm the installed yxdb package exposes open_file()
        # and that the returned reader is iterable.
        reader = yxdb.open_file(path)
        rows = list(reader)
        if rows:
            return pl.DataFrame(rows)
        return pl.DataFrame()