join fix and runner update to support multiple output files

main
casey 2026-06-13 08:40:01 +10:00
parent ab3d7ab971
commit dd1431760a
7 changed files with 1717 additions and 1709 deletions

4
.gitignore vendored
View File

@ -1,3 +1,7 @@
# Alteryx example files
Alteryx_TestWorkflows/JoinTesting/Output/
!Alteryx_TestWorkflows/JoinTesting/Output/**/
# uv # uv
uv.lock uv.lock

File diff suppressed because it is too large Load Diff

View File

@ -1,4 +1,4 @@
Left_Store_ID,Product_ID,Left_Stock_On_Hand Store_ID,Product_ID,Stock_On_Hand
15,31,4 15,31,4
15,32,16 15,32,16
15,33,8 15,33,8
@ -62,6 +62,46 @@ Left_Store_ID,Product_ID,Left_Stock_On_Hand
28,32,3 28,32,3
28,33,9 28,33,9
28,34,19 28,34,19
42,31,11
42,32,4
42,33,18
42,34,34
42,35,13
43,31,18
43,32,38
43,33,5
43,34,7
44,31,8
44,32,29
44,33,0
44,34,22
45,31,6
45,32,6
45,33,7
45,34,3
46,31,13
46,32,8
46,33,11
46,34,24
47,31,48
47,32,6
47,33,13
47,34,3
48,31,41
48,32,7
48,33,0
48,34,39
48,35,3
49,31,51
49,32,11
49,33,15
49,34,2
49,35,19
50,31,18
50,32,9
50,33,1
50,34,17
50,35,8
29,31,3 29,31,3
29,32,7 29,32,7
29,33,6 29,33,6
@ -184,43 +224,3 @@ Left_Store_ID,Product_ID,Left_Stock_On_Hand
14,32,2 14,32,2
14,33,2 14,33,2
14,34,8 14,34,8
42,31,11
42,32,4
42,33,18
42,34,34
42,35,13
43,31,18
43,32,38
43,33,5
43,34,7
44,31,8
44,32,29
44,33,0
44,34,22
45,31,6
45,32,6
45,33,7
45,34,3
46,31,13
46,32,8
46,33,11
46,34,24
47,31,48
47,32,6
47,33,13
47,34,3
48,31,41
48,32,7
48,33,0
48,34,39
48,35,3
49,31,51
49,32,11
49,33,15
49,34,2
49,35,19
50,31,18
50,32,9
50,33,1
50,34,17
50,35,8

1 Left_Store_ID Store_ID Product_ID Left_Stock_On_Hand Stock_On_Hand
2 15 31 4
3 15 32 16
4 15 33 8
62 28 32 3
63 28 33 9
64 28 34 19
65 42 31 11
66 42 32 4
67 42 33 18
68 42 34 34
69 42 35 13
70 43 31 18
71 43 32 38
72 43 33 5
73 43 34 7
74 44 31 8
75 44 32 29
76 44 33 0
77 44 34 22
78 45 31 6
79 45 32 6
80 45 33 7
81 45 34 3
82 46 31 13
83 46 32 8
84 46 33 11
85 46 34 24
86 47 31 48
87 47 32 6
88 47 33 13
89 47 34 3
90 48 31 41
91 48 32 7
92 48 33 0
93 48 34 39
94 48 35 3
95 49 31 51
96 49 32 11
97 49 33 15
98 49 34 2
99 49 35 19
100 50 31 18
101 50 32 9
102 50 33 1
103 50 34 17
104 50 35 8
105 29 31 3
106 29 32 7
107 29 33 6
224 14 32 2
225 14 33 2
226 14 34 8
42 31 11
42 32 4
42 33 18
42 34 34
42 35 13
43 31 18
43 32 38
43 33 5
43 34 7
44 31 8
44 32 29
44 33 0
44 34 22
45 31 6
45 32 6
45 33 7
45 34 3
46 31 13
46 32 8
46 33 11
46 34 24
47 31 48
47 32 6
47 33 13
47 34 3
48 31 41
48 32 7
48 33 0
48 34 39
48 35 3
49 31 51
49 32 11
49 33 15
49 34 2
49 35 19
50 31 18
50 32 9
50 33 1
50 34 17
50 35 8

View File

@ -1,2 +1,2 @@
Product_ID,Right_Product_Name,Right_Product_Category,Right_Product_Cost,Right_Product_Price Product_ID,Product_Name,Product_Category,Product_Cost,Product_Price
100,Non-product,NoCat,$1,$1 100,Non-product,NoCat,$1,$1

1 Product_ID Right_Product_Name Product_Name Right_Product_Category Product_Category Right_Product_Cost Product_Cost Right_Product_Price Product_Price
2 100 Non-product Non-product NoCat NoCat $1 $1 $1 $1

View File

@ -74,7 +74,10 @@ class OutputDataTool(BaseTool):
or "True" or "True"
) )
header = header_val.lower() != "false" header = header_val.lower() != "false"
df.write_csv(str(path), separator=delim, include_header=header) line_end = (opts.findtext("LineEndStyle") or "LF").strip().upper()
eol = "\r\n" if line_end == "CRLF" else "\n"
df.write_csv(str(path), separator=delim, include_header=header,
line_terminator=eol)
elif fmt == 25: # Excel elif fmt == 25: # Excel
df.write_excel(str(path)) df.write_excel(str(path))
elif fmt == 2: # Parquet elif fmt == 2: # Parquet

View File

@ -78,62 +78,68 @@ class JoinTool(BaseTool):
if select_fields is None: if select_fields is None:
return df return df
# Build column mapping order_changed_el = cfg.find("OrderChanged")
# First, collect explicitly selected fields order_changed = (
explicit_selections = [] # list of (src_col, output_name) order_changed_el is not None
and order_changed_el.attrib.get("value", "False") == "True"
)
# Parse field rules
rename_map: dict[str, str] = {} # src_col → output_name
exclude_set: set[str] = set() # columns explicitly excluded
explicit_order: list[str] = [] # for OrderChanged=True
has_unknown = False has_unknown = False
unknown_selected = True
for sf in select_fields.findall("SelectField"): for sf in select_fields.findall("SelectField"):
field = sf.attrib.get("field", "") field = sf.attrib.get("field", "")
selected = sf.attrib.get("selected", "False") == "True" selected = sf.attrib.get("selected", "True") == "True"
rename = sf.attrib.get("rename", "") rename = sf.attrib.get("rename", "")
input_prefix = sf.attrib.get("input", "")
if not selected:
continue
if field == "*Unknown": if field == "*Unknown":
has_unknown = True has_unknown = True
unknown_selected = selected
continue
# Resolve column name in the DataFrame
if field not in df.columns:
continue
if not selected:
exclude_set.add(field)
else: else:
# Find the column with prefix explicit_order.append(field)
src_col = f"{input_prefix}{field}" if input_prefix else field if rename and rename != field:
if src_col in df.columns: rename_map[field] = rename
output_name = rename if rename else field
explicit_selections.append((src_col, output_name))
elif field in df.columns:
output_name = rename if rename else field
explicit_selections.append((field, output_name))
# Build final column list # Build final column list
selected_cols = [] mentioned = set(explicit_order) | exclude_set
rename_map = {}
# Add explicitly selected columns if order_changed:
for src, dst in explicit_selections: # Explicit selections first (in specified order), then *Unknown
selected_cols.append(src) final_cols = list(explicit_order)
if src != dst: if has_unknown and unknown_selected:
rename_map[src] = dst for col in df.columns:
if col not in mentioned:
# Handle *Unknown: include all remaining columns, stripping prefixes final_cols.append(col)
if has_unknown: else:
explicit_srcs = {src for src, _ in explicit_selections} # Preserve original DataFrame column order
final_cols = []
for col in df.columns: for col in df.columns:
if col not in explicit_srcs: if col in exclude_set:
# Strip Left_/Right_ prefix for output name continue
output_name = col if col in mentioned or (has_unknown and unknown_selected):
if col.startswith("Left_"): final_cols.append(col)
output_name = col[5:] elif not has_unknown and col not in mentioned:
elif col.startswith("Right_"): # Default: include if not explicitly excluded
output_name = col[6:] final_cols.append(col)
selected_cols.append(col)
if col != output_name:
rename_map[col] = output_name
# Apply selection and renaming if final_cols:
if selected_cols: df = df.select(final_cols)
df = df.select(selected_cols) if rename_map:
if rename_map: df = df.rename(
df = df.rename(rename_map) {k: v for k, v in rename_map.items() if k in df.columns}
)
break break
return df return df
@ -145,45 +151,40 @@ class JoinTool(BaseTool):
) -> tuple[pl.DataFrame, pl.DataFrame, pl.DataFrame]: ) -> tuple[pl.DataFrame, pl.DataFrame, pl.DataFrame]:
con = self.ctx.duckdb_con con = self.ctx.duckdb_con
# Disambiguate conflicting column names
key_l = {k[0] for k in join_keys} key_l = {k[0] for k in join_keys}
key_r = {k[1] for k in join_keys} key_r = {k[1] for k in join_keys}
l_non_key = [c for c in left.columns if c not in key_l] l_non_key = [c for c in left.columns if c not in key_l]
r_non_key = [c for c in right.columns if c not in key_r] r_non_key = [c for c in right.columns if c not in key_r]
# Only right non-key columns that clash with left columns need a prefix
conflicts = set(l_non_key) & set(r_non_key) conflicts = set(l_non_key) & set(r_non_key)
# Prefix all non-key columns: Left_ for left, Right_ for right # Register the original (un-prefixed) tables
# This matches Alteryx behavior where SelectConfiguration references con.register("__join_left__", left.to_arrow())
# fields with these prefixes con.register("__join_right__", right.to_arrow())
rename_l = {c: f"Left_{c}" for c in l_non_key}
rename_r = {c: f"Right_{c}" for c in r_non_key}
# But keep join keys without prefix (they come from left)
left_r = left.rename(rename_l) if rename_l else left
right_r = right.rename(rename_r) if rename_r else right
con.register("__join_left__", left_r.to_arrow())
con.register("__join_right__", right_r.to_arrow())
# Map renamed key column names
def lk(k: str) -> str:
return rename_l.get(k, k)
def rk(k: str) -> str:
return rename_r.get(k, k)
on_clause = " AND ".join( on_clause = " AND ".join(
f'l."{lk(k[0])}" = r."{rk(k[1])}"' for k in join_keys f'l."{k[0]}" = r."{k[1]}"' for k in join_keys
) )
# Include right join keys with Right_ prefix for SelectConfiguration # --- Inner join SELECT ------------------------------------------------
r_key_cols_sql = ", ".join(f'r."{rk(k[1])}" AS "Right_{k[1]}"' for k in join_keys) # Left columns first (no prefix), then right join keys with Right_
r_cols_sql = ", ".join(f'r."{rk(c)}"' for c in r_non_key) # prefix, then right non-key columns (Right_ prefix only on conflicts).
if r_key_cols_sql: l_cols_sql = ", ".join(f'l."{c}"' for c in left.columns)
r_cols_sql = f"{r_key_cols_sql}, {r_cols_sql}" r_key_cols_sql = ", ".join(
f'r."{k[1]}" AS "Right_{k[1]}"' for k in join_keys
)
r_non_key_sql = ", ".join(
f'r."{c}" AS "Right_{c}"' if c in conflicts else f'r."{c}"'
for c in r_non_key
)
j_parts = [p for p in (l_cols_sql, r_key_cols_sql, r_non_key_sql) if p]
j_select = ", ".join(j_parts)
r_key0 = rk(join_keys[0][1]) l_key0 = join_keys[0][0]
l_key0 = lk(join_keys[0][0]) r_key0 = join_keys[0][1]
j_sql = f"SELECT l.*, {r_cols_sql} FROM __join_left__ l INNER JOIN __join_right__ r ON {on_clause}"
j_sql = f"SELECT {j_select} FROM __join_left__ l INNER JOIN __join_right__ r ON {on_clause}"
# Left/right unmatched keep original column names (no prefixes)
l_sql = f'SELECT l.* FROM __join_left__ l LEFT JOIN __join_right__ r ON {on_clause} WHERE r."{r_key0}" IS NULL' l_sql = f'SELECT l.* FROM __join_left__ l LEFT JOIN __join_right__ r ON {on_clause} WHERE r."{r_key0}" IS NULL'
r_sql = f'SELECT r.* FROM __join_right__ r LEFT JOIN __join_left__ l ON {on_clause} WHERE l."{l_key0}" IS NULL' r_sql = f'SELECT r.* FROM __join_right__ r LEFT JOIN __join_left__ l ON {on_clause} WHERE l."{l_key0}" IS NULL'