from __future__ import annotations

from typing import Dict, Optional
import xml.etree.ElementTree as ET

import polars as pl

from tools.base import BaseTool


class InputDataTool(BaseTool):
    """Load the file named in the tool's XML config into a polars DataFrame.

    The config is expected to contain a ``File`` element whose text is the
    path (optionally followed by ``|||sheet``) and whose ``FileFormat`` /
    ``RecordLimit`` attributes select the reader and cap the row count.
    Reader options are taken from an optional ``FormatSpecificOptions``
    element.
    """

    # Escape spellings that may appear as delimiter text, mapped to the real
    # single character that ``pl.read_csv`` requires as ``separator``.
    # NOTE(review): believed to be how Alteryx workflow XML spells tab, space
    # and "no delimiter" -- confirm against real workflow files.
    _DELIMITER_ESCAPES = {"\\t": "\t", "\\s": " ", "\\0": "\0"}

    def execute(self, inputs: Dict[str, pl.DataFrame]) -> Dict[str, pl.DataFrame]:
        """Read the configured file and return it under the ``"Output"`` key.

        Args:
            inputs: Upstream frames; not used by this tool.

        Returns:
            ``{"Output": df}``. ``df`` is an empty DataFrame when there is no
            config, no ``File`` element, or the ``File`` text is empty/blank.

        Raises:
            ValueError: If ``FileFormat``, ``RecordLimit`` or ``ImportLine``
                is not an integer string.
            NotImplementedError: If a YXDB file is requested but the ``yxdb``
                package cannot be imported.
        """
        if self.config is None:
            return {"Output": pl.DataFrame()}

        file_el = self.config.find("File")
        if file_el is None or not file_el.text:
            return {"Output": pl.DataFrame()}

        raw_path = file_el.text.strip()
        if not raw_path:
            # Whitespace-only text is treated like a missing path instead of
            # resolving and reading an empty path.
            return {"Output": pl.DataFrame()}

        fmt = int(file_el.attrib.get("FileFormat", "0"))
        record_limit_str = file_el.attrib.get("RecordLimit", "").strip()
        limit = int(record_limit_str) if record_limit_str else None

        opts = self.config.find("FormatSpecificOptions")
        if opts is None:
            # Stand-in element so every findtext() lookup below simply
            # returns None and the per-option defaults apply.
            opts = ET.Element("x")

        path_str, sheet = self._parse_path(raw_path)
        resolved = self.ctx.resolve_path(path_str)
        df = self._read(str(resolved), fmt, sheet, opts)

        # Trim whitespace from string columns (matches Alteryx behavior).
        # All string columns are handled in a single with_columns call.
        string_cols = [
            name
            for name, dtype in zip(df.columns, df.dtypes)
            if dtype == pl.String
        ]
        if string_cols:
            df = df.with_columns(pl.col(string_cols).str.strip_chars())

        # A limit of 0 is falsy and therefore means "no limit" here.
        if limit:
            df = df.head(limit)
        return {"Output": df}

    def _parse_path(self, raw: str) -> tuple[str, Optional[str]]:
        """Split ``raw`` into ``(path, sheet)``.

        When ``raw`` contains ``|||``, the part before the first occurrence is
        the path and the part after it is the sheet, with surrounding
        whitespace and backticks removed and trailing ``$`` characters
        stripped. Otherwise the sheet is ``None``.
        """
        if "|||" in raw:
            path, sheet = raw.split("|||", 1)
            return path.strip(), sheet.strip().strip("`").rstrip("$")
        return raw.strip(), None

    def _read(
        self,
        path: str,
        fmt: int,
        sheet: Optional[str],
        opts: ET.Element,
    ) -> pl.DataFrame:
        """Dispatch to the reader selected by the numeric ``fmt`` code.

        Args:
            path: Resolved file path.
            fmt: Format code: 0/6 delimited text, 25 Excel, 2 Parquet,
                19 YXDB, 56 JSON. Any other code falls back to CSV.
            sheet: Excel sheet name, or ``None`` / empty for the default sheet.
            opts: Element holding the format-specific options.

        Returns:
            The file contents as a DataFrame.
        """
        if fmt in (0, 6):  # CSV / delimited
            return self._read_delimited(path, opts)
        if fmt == 25:  # Excel
            return self._read_excel(path, sheet, opts)
        if fmt == 2:  # Parquet
            return pl.read_parquet(path)
        if fmt == 19:  # YXDB
            return self._read_yxdb(path)
        if fmt == 56:  # JSON
            return pl.read_json(path)
        # Fallback: try CSV
        return pl.read_csv(path, infer_schema_length=10000, ignore_errors=True)

    @staticmethod
    def _skip_rows(opts: ET.Element) -> int:
        """Return the rows to skip for the 1-based ``ImportLine`` option."""
        import_line = int(opts.findtext("ImportLine") or "1")
        return max(0, import_line - 1)

    def _read_delimited(self, path: str, opts: ET.Element) -> pl.DataFrame:
        """Read a delimited text file using the options in ``opts``."""
        # The misspelled tag is looked up first, then the correct spelling;
        # a comma is the default when neither is present.
        delim = opts.findtext("Delimeter") or opts.findtext("Delimiter") or ","
        # Translate escape spellings such as a literal backslash-t, which
        # would otherwise reach read_csv as a two-character separator.
        delim = self._DELIMITER_ESCAPES.get(delim, delim)

        header_text = opts.findtext("HeaderRow") or "True"
        has_header = header_text.strip().lower() in ("true", "1", "yes")

        return pl.read_csv(
            path,
            separator=delim,
            has_header=has_header,
            skip_rows=self._skip_rows(opts),
            infer_schema_length=10000,
            ignore_errors=True,
        )

    def _read_excel(
        self,
        path: str,
        sheet: Optional[str],
        opts: ET.Element,
    ) -> pl.DataFrame:
        """Read one Excel sheet; the default sheet when ``sheet`` is empty."""
        first_row_is_data = (opts.findtext("FirstRowData") or "False").lower() == "true"
        excel_kwargs = {
            # NOTE(review): read_options keys are engine-specific in polars;
            # confirm "has_header"/"skip_rows" suit the engine in use.
            "read_options": {
                "has_header": not first_row_is_data,
                "skip_rows": self._skip_rows(opts),
            },
        }
        # Only name a sheet when one was given. The previous `sheet or 0`
        # passed the integer 0 as a sheet *name*, which polars rejects.
        if sheet:
            excel_kwargs["sheet_name"] = sheet
        return pl.read_excel(path, **excel_kwargs)

    @staticmethod
    def _read_yxdb(path: str) -> pl.DataFrame:
        """Read a YXDB file through the optional ``yxdb`` package."""
        # Keep the try limited to the import so that an ImportError raised
        # while actually reading is not mis-reported as a missing package.
        try:
            import yxdb
        except ImportError as err:
            raise NotImplementedError(
                "YXDB format requires the 'yxdb' package: pip install yxdb"
            ) from err

        # NOTE(review): confirm `open_file` is the reader entry point exposed
        # by the installed yxdb package.
        reader = yxdb.open_file(path)
        rows = list(reader)
        if rows:
            return pl.DataFrame(rows)
        return pl.DataFrame()