join fix and runner update to support multiple output files

main
casey 2026-06-13 08:40:01 +10:00
parent ab3d7ab971
commit dd1431760a
7 changed files with 1717 additions and 1709 deletions

4
.gitignore vendored
View File

@ -1,3 +1,7 @@
# Alteryx example files
Alteryx_TestWorkflows/JoinTesting/Output/
!Alteryx_TestWorkflows/JoinTesting/Output/**/
# uv
uv.lock

File diff suppressed because it is too large Load Diff

View File

@ -1,4 +1,4 @@
Left_Store_ID,Product_ID,Left_Stock_On_Hand
Store_ID,Product_ID,Stock_On_Hand
15,31,4
15,32,16
15,33,8
@ -62,6 +62,46 @@ Left_Store_ID,Product_ID,Left_Stock_On_Hand
28,32,3
28,33,9
28,34,19
42,31,11
42,32,4
42,33,18
42,34,34
42,35,13
43,31,18
43,32,38
43,33,5
43,34,7
44,31,8
44,32,29
44,33,0
44,34,22
45,31,6
45,32,6
45,33,7
45,34,3
46,31,13
46,32,8
46,33,11
46,34,24
47,31,48
47,32,6
47,33,13
47,34,3
48,31,41
48,32,7
48,33,0
48,34,39
48,35,3
49,31,51
49,32,11
49,33,15
49,34,2
49,35,19
50,31,18
50,32,9
50,33,1
50,34,17
50,35,8
29,31,3
29,32,7
29,33,6
@ -184,43 +224,3 @@ Left_Store_ID,Product_ID,Left_Stock_On_Hand
14,32,2
14,33,2
14,34,8
42,31,11
42,32,4
42,33,18
42,34,34
42,35,13
43,31,18
43,32,38
43,33,5
43,34,7
44,31,8
44,32,29
44,33,0
44,34,22
45,31,6
45,32,6
45,33,7
45,34,3
46,31,13
46,32,8
46,33,11
46,34,24
47,31,48
47,32,6
47,33,13
47,34,3
48,31,41
48,32,7
48,33,0
48,34,39
48,35,3
49,31,51
49,32,11
49,33,15
49,34,2
49,35,19
50,31,18
50,32,9
50,33,1
50,34,17
50,35,8

1 Left_Store_ID Store_ID Product_ID Left_Stock_On_Hand Stock_On_Hand
2 15 31 4
3 15 32 16
4 15 33 8
62 28 32 3
63 28 33 9
64 28 34 19
65 42 31 11
66 42 32 4
67 42 33 18
68 42 34 34
69 42 35 13
70 43 31 18
71 43 32 38
72 43 33 5
73 43 34 7
74 44 31 8
75 44 32 29
76 44 33 0
77 44 34 22
78 45 31 6
79 45 32 6
80 45 33 7
81 45 34 3
82 46 31 13
83 46 32 8
84 46 33 11
85 46 34 24
86 47 31 48
87 47 32 6
88 47 33 13
89 47 34 3
90 48 31 41
91 48 32 7
92 48 33 0
93 48 34 39
94 48 35 3
95 49 31 51
96 49 32 11
97 49 33 15
98 49 34 2
99 49 35 19
100 50 31 18
101 50 32 9
102 50 33 1
103 50 34 17
104 50 35 8
105 29 31 3
106 29 32 7
107 29 33 6
224 14 32 2
225 14 33 2
226 14 34 8
42 31 11
42 32 4
42 33 18
42 34 34
42 35 13
43 31 18
43 32 38
43 33 5
43 34 7
44 31 8
44 32 29
44 33 0
44 34 22
45 31 6
45 32 6
45 33 7
45 34 3
46 31 13
46 32 8
46 33 11
46 34 24
47 31 48
47 32 6
47 33 13
47 34 3
48 31 41
48 32 7
48 33 0
48 34 39
48 35 3
49 31 51
49 32 11
49 33 15
49 34 2
49 35 19
50 31 18
50 32 9
50 33 1
50 34 17
50 35 8

View File

@ -1,2 +1,2 @@
Product_ID,Right_Product_Name,Right_Product_Category,Right_Product_Cost,Right_Product_Price
Product_ID,Product_Name,Product_Category,Product_Cost,Product_Price
100,Non-product,NoCat,$1,$1

1 Product_ID Right_Product_Name Product_Name Right_Product_Category Product_Category Right_Product_Cost Product_Cost Right_Product_Price Product_Price
2 100 Non-product Non-product NoCat NoCat $1 $1 $1 $1

View File

@ -74,7 +74,10 @@ class OutputDataTool(BaseTool):
or "True"
)
header = header_val.lower() != "false"
df.write_csv(str(path), separator=delim, include_header=header)
line_end = (opts.findtext("LineEndStyle") or "LF").strip().upper()
eol = "\r\n" if line_end == "CRLF" else "\n"
df.write_csv(str(path), separator=delim, include_header=header,
line_terminator=eol)
elif fmt == 25: # Excel
df.write_excel(str(path))
elif fmt == 2: # Parquet

View File

@ -78,62 +78,68 @@ class JoinTool(BaseTool):
if select_fields is None:
return df
# Build column mapping
# First, collect explicitly selected fields
explicit_selections = [] # list of (src_col, output_name)
order_changed_el = cfg.find("OrderChanged")
order_changed = (
order_changed_el is not None
and order_changed_el.attrib.get("value", "False") == "True"
)
# Parse field rules
rename_map: dict[str, str] = {} # src_col → output_name
exclude_set: set[str] = set() # columns explicitly excluded
explicit_order: list[str] = [] # for OrderChanged=True
has_unknown = False
unknown_selected = True
for sf in select_fields.findall("SelectField"):
field = sf.attrib.get("field", "")
selected = sf.attrib.get("selected", "False") == "True"
selected = sf.attrib.get("selected", "True") == "True"
rename = sf.attrib.get("rename", "")
input_prefix = sf.attrib.get("input", "")
if not selected:
continue
if field == "*Unknown":
has_unknown = True
unknown_selected = selected
continue
# Resolve column name in the DataFrame
if field not in df.columns:
continue
if not selected:
exclude_set.add(field)
else:
# Find the column with prefix
src_col = f"{input_prefix}{field}" if input_prefix else field
if src_col in df.columns:
output_name = rename if rename else field
explicit_selections.append((src_col, output_name))
elif field in df.columns:
output_name = rename if rename else field
explicit_selections.append((field, output_name))
explicit_order.append(field)
if rename and rename != field:
rename_map[field] = rename
# Build final column list
selected_cols = []
rename_map = {}
mentioned = set(explicit_order) | exclude_set
# Add explicitly selected columns
for src, dst in explicit_selections:
selected_cols.append(src)
if src != dst:
rename_map[src] = dst
# Handle *Unknown: include all remaining columns, stripping prefixes
if has_unknown:
explicit_srcs = {src for src, _ in explicit_selections}
if order_changed:
# Explicit selections first (in specified order), then *Unknown
final_cols = list(explicit_order)
if has_unknown and unknown_selected:
for col in df.columns:
if col not in explicit_srcs:
# Strip Left_/Right_ prefix for output name
output_name = col
if col.startswith("Left_"):
output_name = col[5:]
elif col.startswith("Right_"):
output_name = col[6:]
selected_cols.append(col)
if col != output_name:
rename_map[col] = output_name
if col not in mentioned:
final_cols.append(col)
else:
# Preserve original DataFrame column order
final_cols = []
for col in df.columns:
if col in exclude_set:
continue
if col in mentioned or (has_unknown and unknown_selected):
final_cols.append(col)
elif not has_unknown and col not in mentioned:
# Default: include if not explicitly excluded
final_cols.append(col)
# Apply selection and renaming
if selected_cols:
df = df.select(selected_cols)
if final_cols:
df = df.select(final_cols)
if rename_map:
df = df.rename(rename_map)
df = df.rename(
{k: v for k, v in rename_map.items() if k in df.columns}
)
break
return df
@ -145,45 +151,40 @@ class JoinTool(BaseTool):
) -> tuple[pl.DataFrame, pl.DataFrame, pl.DataFrame]:
con = self.ctx.duckdb_con
# Disambiguate conflicting column names
key_l = {k[0] for k in join_keys}
key_r = {k[1] for k in join_keys}
l_non_key = [c for c in left.columns if c not in key_l]
r_non_key = [c for c in right.columns if c not in key_r]
# Only right non-key columns that clash with left columns need a prefix
conflicts = set(l_non_key) & set(r_non_key)
# Prefix all non-key columns: Left_ for left, Right_ for right
# This matches Alteryx behavior where SelectConfiguration references
# fields with these prefixes
rename_l = {c: f"Left_{c}" for c in l_non_key}
rename_r = {c: f"Right_{c}" for c in r_non_key}
# But keep join keys without prefix (they come from left)
left_r = left.rename(rename_l) if rename_l else left
right_r = right.rename(rename_r) if rename_r else right
con.register("__join_left__", left_r.to_arrow())
con.register("__join_right__", right_r.to_arrow())
# Map renamed key column names
def lk(k: str) -> str:
return rename_l.get(k, k)
def rk(k: str) -> str:
return rename_r.get(k, k)
# Register the original (un-prefixed) tables
con.register("__join_left__", left.to_arrow())
con.register("__join_right__", right.to_arrow())
on_clause = " AND ".join(
f'l."{lk(k[0])}" = r."{rk(k[1])}"' for k in join_keys
f'l."{k[0]}" = r."{k[1]}"' for k in join_keys
)
# Include right join keys with Right_ prefix for SelectConfiguration
r_key_cols_sql = ", ".join(f'r."{rk(k[1])}" AS "Right_{k[1]}"' for k in join_keys)
r_cols_sql = ", ".join(f'r."{rk(c)}"' for c in r_non_key)
if r_key_cols_sql:
r_cols_sql = f"{r_key_cols_sql}, {r_cols_sql}"
# --- Inner join SELECT ------------------------------------------------
# Left columns first (no prefix), then right join keys with Right_
# prefix, then right non-key columns (Right_ prefix only on conflicts).
l_cols_sql = ", ".join(f'l."{c}"' for c in left.columns)
r_key_cols_sql = ", ".join(
f'r."{k[1]}" AS "Right_{k[1]}"' for k in join_keys
)
r_non_key_sql = ", ".join(
f'r."{c}" AS "Right_{c}"' if c in conflicts else f'r."{c}"'
for c in r_non_key
)
j_parts = [p for p in (l_cols_sql, r_key_cols_sql, r_non_key_sql) if p]
j_select = ", ".join(j_parts)
r_key0 = rk(join_keys[0][1])
l_key0 = lk(join_keys[0][0])
j_sql = f"SELECT l.*, {r_cols_sql} FROM __join_left__ l INNER JOIN __join_right__ r ON {on_clause}"
l_key0 = join_keys[0][0]
r_key0 = join_keys[0][1]
j_sql = f"SELECT {j_select} FROM __join_left__ l INNER JOIN __join_right__ r ON {on_clause}"
# Left/right unmatched keep original column names (no prefixes)
l_sql = f'SELECT l.* FROM __join_left__ l LEFT JOIN __join_right__ r ON {on_clause} WHERE r."{r_key0}" IS NULL'
r_sql = f'SELECT r.* FROM __join_right__ r LEFT JOIN __join_left__ l ON {on_clause} WHERE l."{l_key0}" IS NULL'