""" Alteryx expression → DuckDB SQL transpiler. Handles: [ColumnName] → "ColumnName" "string" → 'string' (double → single quotes) IF...THEN...ENDIF → CASE WHEN...END IIF(c,t,f) → CASE WHEN c THEN t ELSE f END IsNull/IsEmpty → IS NULL checks NULL() → NULL AND/OR/NOT → AND/OR/NOT == / != → = / <> Row references → not supported in SQL mode (raises) All functions in expression/functions.py """ from __future__ import annotations import re from enum import Enum, auto from typing import Optional import polars as pl import duckdb from .functions import get_function_sql, titlecase_sql class UnsupportedExpressionError(Exception): pass # --------------------------------------------------------------------------- # Tokeniser # --------------------------------------------------------------------------- class TT(Enum): LBRACKET = auto() # [ RBRACKET = auto() # ] LPAREN = auto() # ( RPAREN = auto() # ) COMMA = auto() # , PLUS = auto() # + MINUS = auto() # - STAR = auto() # * SLASH = auto() # / PERCENT = auto() # % CONCAT = auto() # + (string, same as PLUS — resolved by context) EQ = auto() # == or = NEQ = auto() # != or <> LT = auto() # < LE = auto() # <= GT = auto() # > GE = auto() # >= AND = auto() OR = auto() NOT = auto() IF = auto() THEN = auto() ELSEIF = auto() ELSE = auto() ENDIF = auto() IIF = auto() NULL_FUNC = auto() # NULL() ISNULL = auto() ISEMPTY = auto() NUMBER = auto() STRING = auto() # double-quoted string literal IDENT = auto() # function name or keyword COLUMN = auto() # [ColName] — after stripping brackets EOF = auto() BANG = auto() # ! (prefix not) PIPE2 = auto() # || (string concat in SQL) POWER = auto() # ^ _KEYWORDS = { "AND": TT.AND, "OR": TT.OR, "NOT": TT.NOT, "IF": TT.IF, "THEN": TT.THEN, "ELSEIF": TT.ELSEIF, "ELSE": TT.ELSE, "ENDIF": TT.ENDIF, "IIF": TT.IIF, "NULL": TT.NULL_FUNC, "ISNULL": TT.ISNULL, "ISEMPTY": TT.ISEMPTY, "ISNUMBER": TT.IDENT, # keep as IDENT, handled in primary "TRUE": TT.IDENT, "FALSE": TT.IDENT, } class Token: __slots__ = ("type", "value") def __init__(self, type_: TT, value: object = None): self.type = type_ self.value = value def __repr__(self): return f"Token({self.type}, {self.value!r})" _TOKEN_RE = re.compile( r""" (?P\s+) | (?P\[[^\]]*\]) | (?P-?\d+(?:\.\d+)?(?:[eE][+-]?\d+)?) | (?P"(?:[^"\\]|\\.)*") | (?P<=) | (?P>=) | (?P!=|<>) | (?P==|=) | (?P<) | (?P>) | (?P\|\|) | (?P\+) | (?P-) | (?P\*) | (?P/) | (?P%) | (?P\^) | (?P!) | (?P\() | (?P\)) | (?P,) | (?P[A-Za-z_]\w*) """, re.VERBOSE, ) def tokenise(text: str) -> list[Token]: tokens: list[Token] = [] pos = 0 while pos < len(text): m = _TOKEN_RE.match(text, pos) if not m: raise UnsupportedExpressionError( f"Unexpected character {text[pos]!r} at pos {pos} in: {text!r}" ) pos = m.end() kind = m.lastgroup raw = m.group() if kind == "SPACE": continue if kind == "COLUMN": tokens.append(Token(TT.COLUMN, raw[1:-1])) # strip [ ] elif kind == "NUMBER": tokens.append(Token(TT.NUMBER, raw)) elif kind == "STRING": # Convert double-quoted Alteryx string to single-quoted SQL inner = raw[1:-1].replace("\\'", "'").replace("'", "''").replace('\\"', '"') tokens.append(Token(TT.STRING, inner)) elif kind == "LE": tokens.append(Token(TT.LE)) elif kind == "GE": tokens.append(Token(TT.GE)) elif kind == "NEQ": tokens.append(Token(TT.NEQ)) elif kind == "EQ": tokens.append(Token(TT.EQ)) elif kind == "LT": tokens.append(Token(TT.LT)) elif kind == "GT": tokens.append(Token(TT.GT)) elif kind == "PIPE2": tokens.append(Token(TT.PIPE2)) elif kind == "CONCAT": tokens.append(Token(TT.PLUS)) elif kind == "MINUS": tokens.append(Token(TT.MINUS)) elif kind == "STAR": tokens.append(Token(TT.STAR)) elif kind == "SLASH": tokens.append(Token(TT.SLASH)) elif kind == "PERCENT": tokens.append(Token(TT.PERCENT)) elif kind == "POWER": tokens.append(Token(TT.POWER)) elif kind == "BANG": tokens.append(Token(TT.BANG)) elif kind == "LPAREN": tokens.append(Token(TT.LPAREN)) elif kind == "RPAREN": tokens.append(Token(TT.RPAREN)) elif kind == "COMMA": tokens.append(Token(TT.COMMA)) elif kind == "IDENT": upper = raw.upper() tt = _KEYWORDS.get(upper, TT.IDENT) tokens.append(Token(tt, raw)) else: raise UnsupportedExpressionError(f"Unhandled token kind {kind}") tokens.append(Token(TT.EOF)) return tokens # --------------------------------------------------------------------------- # Parser / code generator (recursive descent → DuckDB SQL string) # --------------------------------------------------------------------------- class _Parser: def __init__(self, tokens: list[Token]): self._tokens = tokens self._pos = 0 @property def _cur(self) -> Token: return self._tokens[self._pos] def _peek(self, offset: int = 1) -> Token: idx = self._pos + offset if idx >= len(self._tokens): return Token(TT.EOF) return self._tokens[idx] def _advance(self) -> Token: tok = self._tokens[self._pos] self._pos += 1 return tok def _expect(self, tt: TT) -> Token: tok = self._advance() if tok.type != tt: raise UnsupportedExpressionError( f"Expected {tt}, got {tok.type} ({tok.value!r})" ) return tok # ------------------------------------------------------------------ # def parse(self) -> str: sql = self._parse_expr() if self._cur.type != TT.EOF: raise UnsupportedExpressionError( f"Unexpected token at end: {self._cur}" ) return sql def _parse_expr(self) -> str: return self._parse_or() def _parse_or(self) -> str: left = self._parse_and() while self._cur.type == TT.OR: self._advance() right = self._parse_and() left = f"({left} OR {right})" return left def _parse_and(self) -> str: left = self._parse_not() while self._cur.type == TT.AND: self._advance() right = self._parse_not() left = f"({left} AND {right})" return left def _parse_not(self) -> str: if self._cur.type in (TT.NOT, TT.BANG): self._advance() operand = self._parse_not() return f"(NOT {operand})" return self._parse_comparison() def _parse_comparison(self) -> str: left = self._parse_additive() cmp_map = { TT.EQ: "=", TT.NEQ: "<>", TT.LT: "<", TT.LE: "<=", TT.GT: ">", TT.GE: ">=", } if self._cur.type in cmp_map: op = cmp_map[self._advance().type] right = self._parse_additive() return f"({left} {op} {right})" return left def _parse_additive(self) -> str: left = self._parse_multiplicative() while self._cur.type in (TT.PLUS, TT.MINUS, TT.PIPE2): op = self._advance() right = self._parse_multiplicative() if op.type == TT.PIPE2: left = f"({left} || {right})" elif op.type == TT.MINUS: left = f"({left} - {right})" else: left = f"({left} + {right})" return left def _parse_multiplicative(self) -> str: left = self._parse_unary() while self._cur.type in (TT.STAR, TT.SLASH, TT.PERCENT, TT.POWER): op = self._advance() right = self._parse_unary() if op.type == TT.POWER: left = f"POWER({left}, {right})" elif op.type == TT.PERCENT: left = f"({left} % {right})" elif op.type == TT.SLASH: left = f"({left} / {right})" else: left = f"({left} * {right})" return left def _parse_unary(self) -> str: if self._cur.type == TT.MINUS: self._advance() return f"(-{self._parse_primary()})" if self._cur.type == TT.PLUS: self._advance() return self._parse_primary() return self._parse_primary() def _parse_primary(self) -> str: # noqa: C901 (complexity ok for parser) tok = self._cur # Parenthesised sub-expression if tok.type == TT.LPAREN: self._advance() inner = self._parse_expr() self._expect(TT.RPAREN) return f"({inner})" # Column reference if tok.type == TT.COLUMN: self._advance() # Row reference [Row-N:Field] or [Row+N:Field] col = tok.value row_m = re.match(r"^Row([+-]\d+):(.+)$", col, re.IGNORECASE) if row_m: offset = int(row_m.group(1)) field = row_m.group(2) func = "LAG" if offset < 0 else "LEAD" return f'{func}("{field}", {abs(offset)}) OVER ()' return f'"{col}"' # Numeric literal if tok.type == TT.NUMBER: self._advance() return tok.value # String literal (already converted to single-quoted) if tok.type == TT.STRING: self._advance() return f"'{tok.value}'" # IF … THEN … [ELSEIF … THEN …]* [ELSE …] ENDIF if tok.type == TT.IF: return self._parse_if() # NULL() or bare NULL keyword if tok.type == TT.NULL_FUNC: self._advance() if self._cur.type == TT.LPAREN: self._advance() self._expect(TT.RPAREN) return "NULL" # IsNull([F]) — keyword form if tok.type == TT.ISNULL: self._advance() self._expect(TT.LPAREN) inner = self._parse_expr() self._expect(TT.RPAREN) return f"({inner} IS NULL)" # IsEmpty([F]) — keyword form if tok.type == TT.ISEMPTY: self._advance() self._expect(TT.LPAREN) inner = self._parse_expr() self._expect(TT.RPAREN) return f"({inner} IS NULL OR {inner} = '')" # Function call or bare identifier if tok.type == TT.IDENT: name = tok.value upper = name.upper() self._advance() # Bare boolean/null literals if upper == "TRUE": return "TRUE" if upper == "FALSE": return "FALSE" if upper == "NULL": if self._cur.type == TT.LPAREN: self._advance() self._expect(TT.RPAREN) return "NULL" # IsNull / IsEmpty used as plain identifiers (case variations) if upper == "ISNULL": self._expect(TT.LPAREN) inner = self._parse_expr() self._expect(TT.RPAREN) return f"({inner} IS NULL)" if upper == "ISEMPTY": self._expect(TT.LPAREN) inner = self._parse_expr() self._expect(TT.RPAREN) return f"({inner} IS NULL OR {inner} = '')" if upper == "ISNUMBER": self._expect(TT.LPAREN) inner = self._parse_expr() self._expect(TT.RPAREN) return f"(TRY_CAST({inner} AS DOUBLE) IS NOT NULL)" # Titlecase — special SQL rendering if upper == "TITLECASE": self._expect(TT.LPAREN) inner = self._parse_expr() self._expect(TT.RPAREN) return titlecase_sql(inner) # DateTimeAdd / DateTimeDiff need string arg unquoted for INTERVAL if upper == "DATETIMEADD": self._expect(TT.LPAREN) d_arg = self._parse_expr() self._expect(TT.COMMA) n_arg = self._parse_expr() self._expect(TT.COMMA) unit_arg = self._parse_expr() self._expect(TT.RPAREN) # unit_arg is a SQL string like 'days' — strip quotes for INTERVAL keyword unit = unit_arg.strip("'").rstrip("s").upper() return f"({d_arg} + INTERVAL ({n_arg}) {unit})" if upper == "DATETIMEDIFF": self._expect(TT.LPAREN) d1 = self._parse_expr() self._expect(TT.COMMA) d2 = self._parse_expr() self._expect(TT.COMMA) unit_arg = self._parse_expr() self._expect(TT.RPAREN) unit = unit_arg.strip("'").rstrip("s").upper() return f"DATEDIFF('{unit}', {d2}, {d1})" # IIF as identifier (keyword token is TT.IIF but may arrive as IDENT) if upper == "IIF": self._expect(TT.LPAREN) cond = self._parse_expr() self._expect(TT.COMMA) true_val = self._parse_expr() self._expect(TT.COMMA) false_val = self._parse_expr() self._expect(TT.RPAREN) return f"(CASE WHEN {cond} THEN {true_val} ELSE {false_val} END)" if self._cur.type == TT.LPAREN: # Function call self._advance() args: list[str] = [] if self._cur.type != TT.RPAREN: args.append(self._parse_expr()) while self._cur.type == TT.COMMA: self._advance() args.append(self._parse_expr()) self._expect(TT.RPAREN) return get_function_sql(name, args) # Bare identifier (e.g. a column name without brackets — unusual) return f'"{name}"' # IIF keyword token if tok.type == TT.IIF: self._advance() self._expect(TT.LPAREN) cond = self._parse_expr() self._expect(TT.COMMA) true_val = self._parse_expr() self._expect(TT.COMMA) false_val = self._parse_expr() self._expect(TT.RPAREN) return f"(CASE WHEN {cond} THEN {true_val} ELSE {false_val} END)" raise UnsupportedExpressionError(f"Unexpected token: {tok}") def _parse_if(self) -> str: self._expect(TT.IF) branches: list[tuple[str, str]] = [] else_val: Optional[str] = None cond = self._parse_expr() self._expect(TT.THEN) val = self._parse_expr() branches.append((cond, val)) while self._cur.type == TT.ELSEIF: self._advance() cond = self._parse_expr() self._expect(TT.THEN) val = self._parse_expr() branches.append((cond, val)) if self._cur.type == TT.ELSE: self._advance() else_val = self._parse_expr() self._expect(TT.ENDIF) parts = ["CASE"] for cond, val in branches: parts.append(f"WHEN {cond} THEN {val}") if else_val is not None: parts.append(f"ELSE {else_val}") parts.append("END") return " ".join(parts) # --------------------------------------------------------------------------- # Public API # --------------------------------------------------------------------------- def transpile(expression: str) -> str: """Convert an Alteryx expression string to a DuckDB SQL fragment.""" expression = expression.strip() if not expression: return "NULL" tokens = tokenise(expression) return _Parser(tokens).parse() def _coerce_numeric_strings(df: pl.DataFrame) -> pl.DataFrame: """Cast string columns that contain only numeric data to Int64 or Float64. Alteryx implicitly coerces TextInput strings to numbers when the expression treats them numerically. This mirrors that behaviour. """ casts: list[pl.Expr] = [] for col_name in df.columns: s = df[col_name] if s.dtype != pl.String: continue non_null = s.drop_nulls() if len(non_null) == 0: continue # Try integer first (covers integer-looking strings) int_s = non_null.cast(pl.Int64, strict=False) if int_s.null_count() == 0: casts.append(pl.col(col_name).cast(pl.Int64, strict=False)) continue # Try float float_s = non_null.cast(pl.Float64, strict=False) if float_s.null_count() == 0: casts.append(pl.col(col_name).cast(pl.Float64, strict=False)) return df.with_columns(casts) if casts else df class ExpressionTranspiler: """Stateful transpiler bound to a DuckDB connection for evaluation.""" def __init__(self, con: duckdb.DuckDBPyConnection): self._con = con self._view_counter = 0 def _register(self, df: pl.DataFrame) -> str: name = f"_expr_df_{self._view_counter}" self._view_counter += 1 self._con.register(name, df.to_arrow()) return name def eval_mask(self, df: pl.DataFrame, expression: str) -> pl.Series: """Evaluate a boolean Alteryx expression against df, return bool Series.""" sql_expr = transpile(expression) view = self._register(df) try: result = self._con.execute( f'SELECT ({sql_expr}) AS _mask FROM "{view}"' ).pl() return result["_mask"] except duckdb.BinderException: # Type mismatch: retry after coercing numeric-looking string columns self._con.execute(f'DROP VIEW IF EXISTS "{view}"') df2 = _coerce_numeric_strings(df) view = self._register(df2) result = self._con.execute( f'SELECT ({sql_expr}) AS _mask FROM "{view}"' ).pl() return result["_mask"] finally: self._con.execute(f'DROP VIEW IF EXISTS "{view}"') def eval_series( self, df: pl.DataFrame, expression: str, field: str, dtype: pl.PolarsDataType, ) -> pl.Series: """Evaluate a scalar Alteryx expression against df, return a Series.""" sql_expr = transpile(expression) view = self._register(df) try: result = self._con.execute( f'SELECT ({sql_expr}) AS "{field}" FROM "{view}"' ).pl() series = result[field] try: return series.cast(dtype) except Exception: return series except duckdb.BinderException: self._con.execute(f'DROP VIEW IF EXISTS "{view}"') df2 = _coerce_numeric_strings(df) view = self._register(df2) result = self._con.execute( f'SELECT ({sql_expr}) AS "{field}" FROM "{view}"' ).pl() series = result[field] try: return series.cast(dtype) except Exception: return series finally: self._con.execute(f'DROP VIEW IF EXISTS "{view}"') def eval_scalar(self, expression: str) -> object: """Evaluate an expression that requires no input columns.""" sql_expr = transpile(expression) result = self._con.execute(f"SELECT ({sql_expr})").fetchone() return result[0] if result else None