Pyteryx/alteryx_runner/expression/transpiler.py

631 lines
20 KiB
Python

"""
Alteryx expression → DuckDB SQL transpiler.
Handles:
[ColumnName] → "ColumnName"
"string" → 'string' (double → single quotes)
IF...THEN...ENDIF → CASE WHEN...END
IIF(c,t,f) → CASE WHEN c THEN t ELSE f END
IsNull/IsEmpty → IS NULL checks
NULL() → NULL
AND/OR/NOT → AND/OR/NOT
== / != → = / <>
Row references [Row-N:Field] / [Row+N:Field] → LAG / LEAD("Field", N) OVER ()
All functions in expression/functions.py
"""
from __future__ import annotations
import re
from enum import Enum, auto
from typing import Optional
import polars as pl
import duckdb
from .functions import get_function_sql, titlecase_sql
class UnsupportedExpressionError(Exception):
    """Raised when an expression cannot be tokenised or parsed into SQL."""
# ---------------------------------------------------------------------------
# Tokeniser
# ---------------------------------------------------------------------------
class TT(Enum):
    """Token types produced by the tokeniser and consumed by the parser.

    Member order is significant only in that ``auto()`` numbers the members
    sequentially; new members should be appended at the end.
    """

    # --- punctuation ---------------------------------------------------
    LBRACKET = auto()   # "["
    RBRACKET = auto()   # "]"
    LPAREN = auto()     # "("
    RPAREN = auto()     # ")"
    COMMA = auto()      # ","

    # --- arithmetic operators -------------------------------------------
    PLUS = auto()       # "+"
    MINUS = auto()      # "-"
    STAR = auto()       # "*"
    SLASH = auto()      # "/"
    PERCENT = auto()    # "%"
    CONCAT = auto()     # "+" used on strings; same symbol as PLUS, resolved by context

    # --- comparison operators -------------------------------------------
    EQ = auto()         # "==" or "="
    NEQ = auto()        # "!=" or "<>"
    LT = auto()         # "<"
    LE = auto()         # "<="
    GT = auto()         # ">"
    GE = auto()         # ">="

    # --- boolean keywords ------------------------------------------------
    AND = auto()
    OR = auto()
    NOT = auto()

    # --- conditional keywords -------------------------------------------
    IF = auto()
    THEN = auto()
    ELSEIF = auto()
    ELSE = auto()
    ENDIF = auto()
    IIF = auto()

    # --- null / emptiness keywords --------------------------------------
    NULL_FUNC = auto()  # NULL()
    ISNULL = auto()
    ISEMPTY = auto()

    # --- literals and names ---------------------------------------------
    NUMBER = auto()
    STRING = auto()     # double-quoted string literal
    IDENT = auto()      # function name or keyword
    COLUMN = auto()     # [ColName], stored without the brackets

    # --- end marker and later additions ---------------------------------
    EOF = auto()
    BANG = auto()       # "!" (prefix not)
    PIPE2 = auto()      # "||" (string concat in SQL)
    POWER = auto()      # "^"
# Reserved words mapped to their token type.  Keys are upper-case; the
# tokeniser upper-cases each identifier before looking it up here.
_KEYWORDS = {
    # Keywords whose token type has the same name as the word itself.
    **{
        word: TT[word]
        for word in (
            "AND",
            "OR",
            "NOT",
            "IF",
            "THEN",
            "ELSEIF",
            "ELSE",
            "ENDIF",
            "IIF",
            "ISNULL",
            "ISEMPTY",
        )
    },
    "NULL": TT.NULL_FUNC,
    # These stay plain identifiers; the parser's primary rule interprets them.
    "ISNUMBER": TT.IDENT,
    "TRUE": TT.IDENT,
    "FALSE": TT.IDENT,
}
class Token:
    """A single lexical token: a type tag plus an optional payload.

    ``value`` holds the token's text where one is recorded (column name,
    number text, string contents, identifier spelling) and is ``None``
    otherwise.
    """

    __slots__ = ("type", "value")

    def __init__(self, type_: TT, value: object = None):
        self.type, self.value = type_, value

    def __repr__(self):
        return "Token({}, {!r})".format(self.type, self.value)
# Master lexer pattern: one named group per token kind, tried in order at the
# current position.  Two-character operators (<=, >=, !=, <>, ==, ||) are
# listed before the single-character operators they start with so the longer
# form wins.
#
# NUMBER deliberately has no leading sign.  A sign baked into the literal
# makes "5-3" or "[a]-1" lex as two adjacent operands with no operator
# between them; "-" is always emitted as MINUS and the parser decides whether
# it is binary subtraction or unary negation.
_TOKEN_RE = re.compile(
    r"""
      (?P<SPACE>\s+)
    | (?P<COLUMN>\[[^\]]*\])
    | (?P<NUMBER>\d+(?:\.\d+)?(?:[eE][+-]?\d+)?)
    | (?P<STRING>"(?:[^"\\]|\\.)*")
    | (?P<LE><=)
    | (?P<GE>>=)
    | (?P<NEQ>!=|<>)
    | (?P<EQ>==|=)
    | (?P<LT><)
    | (?P<GT>>)
    | (?P<PIPE2>\|\|)
    | (?P<CONCAT>\+)
    | (?P<MINUS>-)
    | (?P<STAR>\*)
    | (?P<SLASH>/)
    | (?P<PERCENT>%)
    | (?P<POWER>\^)
    | (?P<BANG>!)
    | (?P<LPAREN>\()
    | (?P<RPAREN>\))
    | (?P<COMMA>,)
    | (?P<IDENT>[A-Za-z_]\w*)
    """,
    re.VERBOSE,
)
def tokenise(text: str) -> list[Token]:
    """Split an expression string into tokens, ending with an EOF token.

    Whitespace is discarded.  Column references are stored without their
    brackets, string literals without their quotes (embedded single quotes
    doubled for SQL), and identifiers are checked against the keyword table
    case-insensitively while keeping their original spelling as the value.

    Raises:
        UnsupportedExpressionError: if no token pattern matches at some
            position in ``text``.
    """
    # Token kinds that carry no payload: regex group name → token type.
    # The "CONCAT" group is the "+" sign and is emitted as PLUS.
    payload_free = {
        "LE": TT.LE,
        "GE": TT.GE,
        "NEQ": TT.NEQ,
        "EQ": TT.EQ,
        "LT": TT.LT,
        "GT": TT.GT,
        "PIPE2": TT.PIPE2,
        "CONCAT": TT.PLUS,
        "MINUS": TT.MINUS,
        "STAR": TT.STAR,
        "SLASH": TT.SLASH,
        "PERCENT": TT.PERCENT,
        "POWER": TT.POWER,
        "BANG": TT.BANG,
        "LPAREN": TT.LPAREN,
        "RPAREN": TT.RPAREN,
        "COMMA": TT.COMMA,
    }

    out: list[Token] = []
    cursor = 0
    end = len(text)
    while cursor < end:
        match = _TOKEN_RE.match(text, cursor)
        if match is None:
            raise UnsupportedExpressionError(
                f"Unexpected character {text[cursor]!r} at pos {cursor} in: {text!r}"
            )
        cursor = match.end()
        kind = match.lastgroup
        lexeme = match.group()

        if kind == "SPACE":
            continue

        if kind in payload_free:
            out.append(Token(payload_free[kind]))
        elif kind == "COLUMN":
            # Drop the surrounding [ ].
            out.append(Token(TT.COLUMN, lexeme[1:-1]))
        elif kind == "NUMBER":
            out.append(Token(TT.NUMBER, lexeme))
        elif kind == "STRING":
            # Drop the surrounding double quotes, unescape \' and \", and
            # double any single quote so the text can sit inside '...' in SQL.
            body = lexeme[1:-1]
            body = body.replace("\\'", "'")
            body = body.replace("'", "''")
            body = body.replace('\\"', '"')
            out.append(Token(TT.STRING, body))
        elif kind == "IDENT":
            token_type = _KEYWORDS.get(lexeme.upper(), TT.IDENT)
            out.append(Token(token_type, lexeme))
        else:
            raise UnsupportedExpressionError(f"Unhandled token kind {kind}")

    out.append(Token(TT.EOF))
    return out
# ---------------------------------------------------------------------------
# Parser / code generator (recursive descent → DuckDB SQL string)
# ---------------------------------------------------------------------------
class _Parser:
    """Recursive-descent parser that turns a token list into DuckDB SQL text.

    Each ``_parse_*`` method handles one precedence level and returns the SQL
    string for the sub-expression it consumed.  Levels, from loosest to
    tightest binding:

        OR  <  AND  <  NOT / !  <  comparison  <  + - ||  <  * / %  <  ^
            <  unary sign  <  primary

    The token list is expected to end with an EOF token.
    """

    def __init__(self, tokens: list[Token]):
        self._tokens = tokens
        self._pos = 0

    # ------------------------------------------------------------------ #
    # Token cursor helpers
    # ------------------------------------------------------------------ #
    @property
    def _cur(self) -> Token:
        """The token at the current position (not consumed)."""
        return self._tokens[self._pos]

    def _peek(self, offset: int = 1) -> Token:
        """Look ahead ``offset`` tokens; past the end yields a fresh EOF."""
        idx = self._pos + offset
        if idx >= len(self._tokens):
            return Token(TT.EOF)
        return self._tokens[idx]

    def _advance(self) -> Token:
        """Consume and return the current token."""
        tok = self._tokens[self._pos]
        self._pos += 1
        return tok

    def _expect(self, tt: TT) -> Token:
        """Consume the current token, raising unless it has type ``tt``."""
        tok = self._advance()
        if tok.type != tt:
            raise UnsupportedExpressionError(
                f"Expected {tt}, got {tok.type} ({tok.value!r})"
            )
        return tok

    @staticmethod
    def _quote_ident(name: str) -> str:
        """Render ``name`` as a double-quoted SQL identifier.

        Embedded double quotes are doubled so a column such as ``a"b`` cannot
        terminate the identifier early.
        """
        return '"' + name.replace('"', '""') + '"'

    # ------------------------------------------------------------------ #
    # Grammar
    # ------------------------------------------------------------------ #
    def parse(self) -> str:
        """Parse the whole token list and return the SQL fragment.

        Raises:
            UnsupportedExpressionError: on any syntax error, including
                trailing tokens after a complete expression.
        """
        sql = self._parse_expr()
        if self._cur.type != TT.EOF:
            raise UnsupportedExpressionError(
                f"Unexpected token at end: {self._cur}"
            )
        return sql

    def _parse_expr(self) -> str:
        return self._parse_or()

    def _parse_or(self) -> str:
        left = self._parse_and()
        while self._cur.type == TT.OR:
            self._advance()
            right = self._parse_and()
            left = f"({left} OR {right})"
        return left

    def _parse_and(self) -> str:
        left = self._parse_not()
        while self._cur.type == TT.AND:
            self._advance()
            right = self._parse_not()
            left = f"({left} AND {right})"
        return left

    def _parse_not(self) -> str:
        if self._cur.type in (TT.NOT, TT.BANG):
            self._advance()
            operand = self._parse_not()
            return f"(NOT {operand})"
        return self._parse_comparison()

    def _parse_comparison(self) -> str:
        """Parse at most one comparison; comparisons do not chain."""
        left = self._parse_additive()
        cmp_map = {
            TT.EQ: "=",
            TT.NEQ: "<>",
            TT.LT: "<",
            TT.LE: "<=",
            TT.GT: ">",
            TT.GE: ">=",
        }
        if self._cur.type in cmp_map:
            op = cmp_map[self._advance().type]
            right = self._parse_additive()
            return f"({left} {op} {right})"
        return left

    def _parse_additive(self) -> str:
        # NOTE(review): Alteryx documents "||" as Boolean OR, but here it is
        # emitted as SQL string concatenation — confirm which is intended.
        left = self._parse_multiplicative()
        while self._cur.type in (TT.PLUS, TT.MINUS, TT.PIPE2):
            op = self._advance()
            right = self._parse_multiplicative()
            if op.type == TT.PIPE2:
                left = f"({left} || {right})"
            elif op.type == TT.MINUS:
                left = f"({left} - {right})"
            else:
                left = f"({left} + {right})"
        return left

    def _parse_multiplicative(self) -> str:
        left = self._parse_power()
        while self._cur.type in (TT.STAR, TT.SLASH, TT.PERCENT):
            op = self._advance()
            right = self._parse_power()
            if op.type == TT.PERCENT:
                left = f"({left} % {right})"
            elif op.type == TT.SLASH:
                left = f"({left} / {right})"
            else:
                left = f"({left} * {right})"
        return left

    def _parse_power(self) -> str:
        """Parse ``^`` chains, binding tighter than ``* / %``.

        Exponentiation must not share a level with multiplication, otherwise
        ``2 * 3 ^ 2`` is grouped as ``(2 * 3) ^ 2``.  The operator stays
        left-associative and its operands may carry a unary sign.
        """
        left = self._parse_unary()
        while self._cur.type == TT.POWER:
            self._advance()
            right = self._parse_unary()
            left = f"POWER({left}, {right})"
        return left

    def _parse_unary(self) -> str:
        if self._cur.type == TT.MINUS:
            self._advance()
            return f"(-{self._parse_primary()})"
        if self._cur.type == TT.PLUS:
            # Unary plus is a no-op.
            self._advance()
            return self._parse_primary()
        return self._parse_primary()

    def _parse_primary(self) -> str:
        """Parse a literal, column, parenthesised expression or special form."""
        tok = self._cur

        # Parenthesised sub-expression
        if tok.type == TT.LPAREN:
            self._advance()
            inner = self._parse_expr()
            self._expect(TT.RPAREN)
            return f"({inner})"

        # Column reference (plain or row-relative)
        if tok.type == TT.COLUMN:
            self._advance()
            return self._column_sql(tok.value)

        # Numeric literal: emitted exactly as written
        if tok.type == TT.NUMBER:
            self._advance()
            return tok.value

        # String literal (single quotes already doubled by the tokeniser)
        if tok.type == TT.STRING:
            self._advance()
            return f"'{tok.value}'"

        # IF … THEN … [ELSEIF … THEN …]* [ELSE …] ENDIF
        if tok.type == TT.IF:
            return self._parse_if()

        # Keyword-token special forms
        if tok.type == TT.NULL_FUNC:
            self._advance()
            return self._null_sql()
        if tok.type == TT.ISNULL:
            self._advance()
            return self._isnull_sql()
        if tok.type == TT.ISEMPTY:
            self._advance()
            return self._isempty_sql()
        if tok.type == TT.IIF:
            self._advance()
            return self._iif_sql()

        # Function call, named special form, or bare identifier
        if tok.type == TT.IDENT:
            self._advance()
            return self._parse_identifier(tok.value)

        raise UnsupportedExpressionError(f"Unexpected token: {tok}")

    def _parse_identifier(self, name: str) -> str:
        """Translate an identifier whose token has already been consumed."""
        upper = name.upper()
        if upper in ("TRUE", "FALSE"):
            return upper

        # Names with dedicated SQL renderings.  NULL / ISNULL / ISEMPTY / IIF
        # are also accepted here in case they arrive as IDENT tokens rather
        # than as their keyword token types.
        special_forms = {
            "NULL": self._null_sql,
            "ISNULL": self._isnull_sql,
            "ISEMPTY": self._isempty_sql,
            "ISNUMBER": self._isnumber_sql,
            "TITLECASE": self._titlecase_sql,
            "DATETIMEADD": self._datetimeadd_sql,
            "DATETIMEDIFF": self._datetimediff_sql,
            "IIF": self._iif_sql,
        }
        handler = special_forms.get(upper)
        if handler is not None:
            return handler()

        if self._cur.type == TT.LPAREN:
            # Generic function call: rendering is delegated to get_function_sql.
            self._advance()
            args: list[str] = []
            if self._cur.type != TT.RPAREN:
                args.append(self._parse_expr())
                while self._cur.type == TT.COMMA:
                    self._advance()
                    args.append(self._parse_expr())
            self._expect(TT.RPAREN)
            return get_function_sql(name, args)

        # Bare identifier (e.g. a column name without brackets — unusual)
        return self._quote_ident(name)

    # ------------------------------------------------------------------ #
    # Special forms (the introducing token has already been consumed)
    # ------------------------------------------------------------------ #
    def _column_sql(self, col: str) -> str:
        """Render ``[Col]``, or ``[Row-N:Field]`` / ``[Row+N:Field]`` as LAG / LEAD."""
        row_m = re.match(r"^Row([+-]\d+):(.+)$", col, re.IGNORECASE)
        if row_m:
            offset = int(row_m.group(1))
            field = self._quote_ident(row_m.group(2))
            func = "LAG" if offset < 0 else "LEAD"
            return f"{func}({field}, {abs(offset)}) OVER ()"
        return self._quote_ident(col)

    def _parse_single_arg(self) -> str:
        """Parse ``( expr )`` and return the SQL of ``expr``."""
        self._expect(TT.LPAREN)
        inner = self._parse_expr()
        self._expect(TT.RPAREN)
        return inner

    def _parse_three_args(self) -> tuple[str, str, str]:
        """Parse ``( a , b , c )`` and return the SQL of the three arguments."""
        self._expect(TT.LPAREN)
        first = self._parse_expr()
        self._expect(TT.COMMA)
        second = self._parse_expr()
        self._expect(TT.COMMA)
        third = self._parse_expr()
        self._expect(TT.RPAREN)
        return first, second, third

    def _null_sql(self) -> str:
        """``NULL`` or ``NULL()``; the parentheses are optional."""
        if self._cur.type == TT.LPAREN:
            self._advance()
            self._expect(TT.RPAREN)
        return "NULL"

    def _isnull_sql(self) -> str:
        return f"({self._parse_single_arg()} IS NULL)"

    def _isempty_sql(self) -> str:
        inner = self._parse_single_arg()
        return f"({inner} IS NULL OR {inner} = '')"

    def _isnumber_sql(self) -> str:
        return f"(TRY_CAST({self._parse_single_arg()} AS DOUBLE) IS NOT NULL)"

    def _titlecase_sql(self) -> str:
        return titlecase_sql(self._parse_single_arg())

    def _iif_sql(self) -> str:
        cond, true_val, false_val = self._parse_three_args()
        return f"(CASE WHEN {cond} THEN {true_val} ELSE {false_val} END)"

    @staticmethod
    def _interval_unit(unit_arg: str) -> str:
        """Turn a rendered unit literal such as ``'days'`` into ``DAY``.

        The result is spliced into the SQL text as a keyword, so anything
        that is not purely alphabetic (for example a column reference used
        as the unit) is rejected instead of producing malformed SQL.
        """
        unit = unit_arg.strip("'").strip().rstrip("s").upper()
        if not re.fullmatch(r"[A-Z]+", unit):
            raise UnsupportedExpressionError(
                f"Date/time unit must be a string literal, got: {unit_arg}"
            )
        return unit

    def _datetimeadd_sql(self) -> str:
        d_arg, n_arg, unit_arg = self._parse_three_args()
        unit = self._interval_unit(unit_arg)
        return f"({d_arg} + INTERVAL ({n_arg}) {unit})"

    def _datetimediff_sql(self) -> str:
        d1, d2, unit_arg = self._parse_three_args()
        unit = self._interval_unit(unit_arg)
        # Argument order is swapped: the second date is passed as the start.
        return f"DATEDIFF('{unit}', {d2}, {d1})"

    def _parse_if(self) -> str:
        """Translate IF … THEN … [ELSEIF … THEN …]* [ELSE …] ENDIF to CASE."""
        self._expect(TT.IF)
        branches: list[tuple[str, str]] = []
        else_val: Optional[str] = None

        cond = self._parse_expr()
        self._expect(TT.THEN)
        val = self._parse_expr()
        branches.append((cond, val))

        while self._cur.type == TT.ELSEIF:
            self._advance()
            cond = self._parse_expr()
            self._expect(TT.THEN)
            val = self._parse_expr()
            branches.append((cond, val))

        if self._cur.type == TT.ELSE:
            self._advance()
            else_val = self._parse_expr()
        self._expect(TT.ENDIF)

        parts = ["CASE"]
        for cond, val in branches:
            parts.append(f"WHEN {cond} THEN {val}")
        if else_val is not None:
            parts.append(f"ELSE {else_val}")
        parts.append("END")
        return " ".join(parts)
# ---------------------------------------------------------------------------
# Public API
# ---------------------------------------------------------------------------
def transpile(expression: str) -> str:
    """Translate one Alteryx expression into a DuckDB SQL fragment.

    Surrounding whitespace is ignored; an empty expression yields ``"NULL"``.
    Tokenising and parsing errors propagate as raised by those steps.
    """
    source = expression.strip()
    if source:
        return _Parser(tokenise(source)).parse()
    return "NULL"
def _coerce_numeric_strings(df: pl.DataFrame) -> pl.DataFrame:
    """Return ``df`` with all-numeric string columns cast to Int64 or Float64.

    A string column is converted only when every non-null value survives a
    non-strict cast; Int64 is tried before Float64.  Columns that are not
    strings, or that hold no non-null values, are left alone.  When nothing
    qualifies the input frame itself is returned.

    This mirrors Alteryx implicitly treating TextInput strings as numbers
    when an expression uses them numerically.
    """
    conversions: list[pl.Expr] = []
    for name in df.columns:
        column = df[name]
        if column.dtype != pl.String:
            continue
        present = column.drop_nulls()
        if len(present) == 0:
            continue
        # First target dtype that loses no values wins.
        for target in (pl.Int64, pl.Float64):
            if present.cast(target, strict=False).null_count() == 0:
                conversions.append(pl.col(name).cast(target, strict=False))
                break
    if not conversions:
        return df
    return df.with_columns(conversions)
class ExpressionTranspiler:
    """Stateful transpiler bound to a DuckDB connection for evaluation.

    Each evaluation registers the input frame on the connection under a
    fresh name, runs a single SELECT against it, and drops the registration
    afterwards.
    """

    def __init__(self, con: duckdb.DuckDBPyConnection):
        self._con = con
        self._view_counter = 0  # suffix for the next registration name

    def _register(self, df: pl.DataFrame) -> str:
        """Register ``df`` (as Arrow) on the connection and return its name."""
        name = f"_expr_df_{self._view_counter}"
        self._view_counter += 1
        self._con.register(name, df.to_arrow())
        return name

    def _select_column(
        self, df: pl.DataFrame, sql_expr: str, alias: str
    ) -> pl.Series:
        """Evaluate ``sql_expr`` over ``df`` and return the column ``alias``.

        If DuckDB rejects the query with a BinderException, the query is
        retried once against a copy of ``df`` whose numeric-looking string
        columns have been cast to numbers.  Whatever is registered at the
        end is always dropped.
        """
        # Double embedded quotes so the alias cannot break out of the
        # quoted identifier.
        quoted_alias = '"' + alias.replace('"', '""') + '"'
        view = self._register(df)
        try:
            try:
                result = self._con.execute(
                    f'SELECT ({sql_expr}) AS {quoted_alias} FROM "{view}"'
                ).pl()
            except duckdb.BinderException:
                # Type mismatch: retry after coercing numeric-looking strings.
                self._con.execute(f'DROP VIEW IF EXISTS "{view}"')
                view = self._register(_coerce_numeric_strings(df))
                result = self._con.execute(
                    f'SELECT ({sql_expr}) AS {quoted_alias} FROM "{view}"'
                ).pl()
            return result[alias]
        finally:
            self._con.execute(f'DROP VIEW IF EXISTS "{view}"')

    def eval_mask(self, df: pl.DataFrame, expression: str) -> pl.Series:
        """Evaluate a boolean Alteryx expression against df, return bool Series."""
        return self._select_column(df, transpile(expression), "_mask")

    def eval_series(
        self,
        df: pl.DataFrame,
        expression: str,
        field: str,
        dtype: pl.PolarsDataType,
    ) -> pl.Series:
        """Evaluate a scalar Alteryx expression against df, return a Series.

        The result is named ``field`` and cast to ``dtype`` when possible.
        """
        series = self._select_column(df, transpile(expression), field)
        try:
            return series.cast(dtype)
        except Exception:
            # Best effort: keep DuckDB's own result type if the cast fails.
            return series

    def eval_scalar(self, expression: str) -> object:
        """Evaluate an expression that requires no input columns."""
        sql_expr = transpile(expression)
        result = self._con.execute(f"SELECT ({sql_expr})").fetchone()
        return result[0] if result else None