join fix and runner update to support multiple output files
parent
ab3d7ab971
commit
dd1431760a
|
|
@ -1,3 +1,7 @@
|
|||
# Alteryx example files
|
||||
Alteryx_TestWorkflows/JoinTesting/Output/
|
||||
!Alteryx_TestWorkflows/JoinTesting/Output/**/
|
||||
|
||||
# uv
|
||||
uv.lock
|
||||
|
||||
|
|
|
|||
File diff suppressed because it is too large
Load Diff
|
|
@ -1,4 +1,4 @@
|
|||
Left_Store_ID,Product_ID,Left_Stock_On_Hand
|
||||
Store_ID,Product_ID,Stock_On_Hand
|
||||
15,31,4
|
||||
15,32,16
|
||||
15,33,8
|
||||
|
|
@ -62,6 +62,46 @@ Left_Store_ID,Product_ID,Left_Stock_On_Hand
|
|||
28,32,3
|
||||
28,33,9
|
||||
28,34,19
|
||||
42,31,11
|
||||
42,32,4
|
||||
42,33,18
|
||||
42,34,34
|
||||
42,35,13
|
||||
43,31,18
|
||||
43,32,38
|
||||
43,33,5
|
||||
43,34,7
|
||||
44,31,8
|
||||
44,32,29
|
||||
44,33,0
|
||||
44,34,22
|
||||
45,31,6
|
||||
45,32,6
|
||||
45,33,7
|
||||
45,34,3
|
||||
46,31,13
|
||||
46,32,8
|
||||
46,33,11
|
||||
46,34,24
|
||||
47,31,48
|
||||
47,32,6
|
||||
47,33,13
|
||||
47,34,3
|
||||
48,31,41
|
||||
48,32,7
|
||||
48,33,0
|
||||
48,34,39
|
||||
48,35,3
|
||||
49,31,51
|
||||
49,32,11
|
||||
49,33,15
|
||||
49,34,2
|
||||
49,35,19
|
||||
50,31,18
|
||||
50,32,9
|
||||
50,33,1
|
||||
50,34,17
|
||||
50,35,8
|
||||
29,31,3
|
||||
29,32,7
|
||||
29,33,6
|
||||
|
|
@ -184,43 +224,3 @@ Left_Store_ID,Product_ID,Left_Stock_On_Hand
|
|||
14,32,2
|
||||
14,33,2
|
||||
14,34,8
|
||||
42,31,11
|
||||
42,32,4
|
||||
42,33,18
|
||||
42,34,34
|
||||
42,35,13
|
||||
43,31,18
|
||||
43,32,38
|
||||
43,33,5
|
||||
43,34,7
|
||||
44,31,8
|
||||
44,32,29
|
||||
44,33,0
|
||||
44,34,22
|
||||
45,31,6
|
||||
45,32,6
|
||||
45,33,7
|
||||
45,34,3
|
||||
46,31,13
|
||||
46,32,8
|
||||
46,33,11
|
||||
46,34,24
|
||||
47,31,48
|
||||
47,32,6
|
||||
47,33,13
|
||||
47,34,3
|
||||
48,31,41
|
||||
48,32,7
|
||||
48,33,0
|
||||
48,34,39
|
||||
48,35,3
|
||||
49,31,51
|
||||
49,32,11
|
||||
49,33,15
|
||||
49,34,2
|
||||
49,35,19
|
||||
50,31,18
|
||||
50,32,9
|
||||
50,33,1
|
||||
50,34,17
|
||||
50,35,8
|
||||
|
|
|
|||
|
|
|
@ -1,2 +1,2 @@
|
|||
Product_ID,Right_Product_Name,Right_Product_Category,Right_Product_Cost,Right_Product_Price
|
||||
Product_ID,Product_Name,Product_Category,Product_Cost,Product_Price
|
||||
100,Non-product,NoCat,$1,$1
|
||||
|
|
|
|||
|
|
|
@ -74,7 +74,10 @@ class OutputDataTool(BaseTool):
|
|||
or "True"
|
||||
)
|
||||
header = header_val.lower() != "false"
|
||||
df.write_csv(str(path), separator=delim, include_header=header)
|
||||
line_end = (opts.findtext("LineEndStyle") or "LF").strip().upper()
|
||||
eol = "\r\n" if line_end == "CRLF" else "\n"
|
||||
df.write_csv(str(path), separator=delim, include_header=header,
|
||||
line_terminator=eol)
|
||||
elif fmt == 25: # Excel
|
||||
df.write_excel(str(path))
|
||||
elif fmt == 2: # Parquet
|
||||
|
|
|
|||
|
|
@ -78,62 +78,68 @@ class JoinTool(BaseTool):
|
|||
if select_fields is None:
|
||||
return df
|
||||
|
||||
# Build column mapping
|
||||
# First, collect explicitly selected fields
|
||||
explicit_selections = [] # list of (src_col, output_name)
|
||||
order_changed_el = cfg.find("OrderChanged")
|
||||
order_changed = (
|
||||
order_changed_el is not None
|
||||
and order_changed_el.attrib.get("value", "False") == "True"
|
||||
)
|
||||
|
||||
# Parse field rules
|
||||
rename_map: dict[str, str] = {} # src_col → output_name
|
||||
exclude_set: set[str] = set() # columns explicitly excluded
|
||||
explicit_order: list[str] = [] # for OrderChanged=True
|
||||
has_unknown = False
|
||||
unknown_selected = True
|
||||
|
||||
for sf in select_fields.findall("SelectField"):
|
||||
field = sf.attrib.get("field", "")
|
||||
selected = sf.attrib.get("selected", "False") == "True"
|
||||
selected = sf.attrib.get("selected", "True") == "True"
|
||||
rename = sf.attrib.get("rename", "")
|
||||
input_prefix = sf.attrib.get("input", "")
|
||||
|
||||
if not selected:
|
||||
continue
|
||||
|
||||
if field == "*Unknown":
|
||||
has_unknown = True
|
||||
unknown_selected = selected
|
||||
continue
|
||||
|
||||
# Resolve column name in the DataFrame
|
||||
if field not in df.columns:
|
||||
continue
|
||||
|
||||
if not selected:
|
||||
exclude_set.add(field)
|
||||
else:
|
||||
# Find the column with prefix
|
||||
src_col = f"{input_prefix}{field}" if input_prefix else field
|
||||
if src_col in df.columns:
|
||||
output_name = rename if rename else field
|
||||
explicit_selections.append((src_col, output_name))
|
||||
elif field in df.columns:
|
||||
output_name = rename if rename else field
|
||||
explicit_selections.append((field, output_name))
|
||||
explicit_order.append(field)
|
||||
if rename and rename != field:
|
||||
rename_map[field] = rename
|
||||
|
||||
# Build final column list
|
||||
selected_cols = []
|
||||
rename_map = {}
|
||||
mentioned = set(explicit_order) | exclude_set
|
||||
|
||||
# Add explicitly selected columns
|
||||
for src, dst in explicit_selections:
|
||||
selected_cols.append(src)
|
||||
if src != dst:
|
||||
rename_map[src] = dst
|
||||
|
||||
# Handle *Unknown: include all remaining columns, stripping prefixes
|
||||
if has_unknown:
|
||||
explicit_srcs = {src for src, _ in explicit_selections}
|
||||
if order_changed:
|
||||
# Explicit selections first (in specified order), then *Unknown
|
||||
final_cols = list(explicit_order)
|
||||
if has_unknown and unknown_selected:
|
||||
for col in df.columns:
|
||||
if col not in mentioned:
|
||||
final_cols.append(col)
|
||||
else:
|
||||
# Preserve original DataFrame column order
|
||||
final_cols = []
|
||||
for col in df.columns:
|
||||
if col not in explicit_srcs:
|
||||
# Strip Left_/Right_ prefix for output name
|
||||
output_name = col
|
||||
if col.startswith("Left_"):
|
||||
output_name = col[5:]
|
||||
elif col.startswith("Right_"):
|
||||
output_name = col[6:]
|
||||
selected_cols.append(col)
|
||||
if col != output_name:
|
||||
rename_map[col] = output_name
|
||||
if col in exclude_set:
|
||||
continue
|
||||
if col in mentioned or (has_unknown and unknown_selected):
|
||||
final_cols.append(col)
|
||||
elif not has_unknown and col not in mentioned:
|
||||
# Default: include if not explicitly excluded
|
||||
final_cols.append(col)
|
||||
|
||||
# Apply selection and renaming
|
||||
if selected_cols:
|
||||
df = df.select(selected_cols)
|
||||
if rename_map:
|
||||
df = df.rename(rename_map)
|
||||
if final_cols:
|
||||
df = df.select(final_cols)
|
||||
if rename_map:
|
||||
df = df.rename(
|
||||
{k: v for k, v in rename_map.items() if k in df.columns}
|
||||
)
|
||||
break
|
||||
return df
|
||||
|
||||
|
|
@ -145,45 +151,40 @@ class JoinTool(BaseTool):
|
|||
) -> tuple[pl.DataFrame, pl.DataFrame, pl.DataFrame]:
|
||||
con = self.ctx.duckdb_con
|
||||
|
||||
# Disambiguate conflicting column names
|
||||
key_l = {k[0] for k in join_keys}
|
||||
key_r = {k[1] for k in join_keys}
|
||||
l_non_key = [c for c in left.columns if c not in key_l]
|
||||
r_non_key = [c for c in right.columns if c not in key_r]
|
||||
# Only right non-key columns that clash with left columns need a prefix
|
||||
conflicts = set(l_non_key) & set(r_non_key)
|
||||
|
||||
# Prefix all non-key columns: Left_ for left, Right_ for right
|
||||
# This matches Alteryx behavior where SelectConfiguration references
|
||||
# fields with these prefixes
|
||||
rename_l = {c: f"Left_{c}" for c in l_non_key}
|
||||
rename_r = {c: f"Right_{c}" for c in r_non_key}
|
||||
# But keep join keys without prefix (they come from left)
|
||||
left_r = left.rename(rename_l) if rename_l else left
|
||||
right_r = right.rename(rename_r) if rename_r else right
|
||||
|
||||
con.register("__join_left__", left_r.to_arrow())
|
||||
con.register("__join_right__", right_r.to_arrow())
|
||||
|
||||
# Map renamed key column names
|
||||
def lk(k: str) -> str:
|
||||
return rename_l.get(k, k)
|
||||
|
||||
def rk(k: str) -> str:
|
||||
return rename_r.get(k, k)
|
||||
# Register the original (un-prefixed) tables
|
||||
con.register("__join_left__", left.to_arrow())
|
||||
con.register("__join_right__", right.to_arrow())
|
||||
|
||||
on_clause = " AND ".join(
|
||||
f'l."{lk(k[0])}" = r."{rk(k[1])}"' for k in join_keys
|
||||
f'l."{k[0]}" = r."{k[1]}"' for k in join_keys
|
||||
)
|
||||
|
||||
# Include right join keys with Right_ prefix for SelectConfiguration
|
||||
r_key_cols_sql = ", ".join(f'r."{rk(k[1])}" AS "Right_{k[1]}"' for k in join_keys)
|
||||
r_cols_sql = ", ".join(f'r."{rk(c)}"' for c in r_non_key)
|
||||
if r_key_cols_sql:
|
||||
r_cols_sql = f"{r_key_cols_sql}, {r_cols_sql}"
|
||||
# --- Inner join SELECT ------------------------------------------------
|
||||
# Left columns first (no prefix), then right join keys with Right_
|
||||
# prefix, then right non-key columns (Right_ prefix only on conflicts).
|
||||
l_cols_sql = ", ".join(f'l."{c}"' for c in left.columns)
|
||||
r_key_cols_sql = ", ".join(
|
||||
f'r."{k[1]}" AS "Right_{k[1]}"' for k in join_keys
|
||||
)
|
||||
r_non_key_sql = ", ".join(
|
||||
f'r."{c}" AS "Right_{c}"' if c in conflicts else f'r."{c}"'
|
||||
for c in r_non_key
|
||||
)
|
||||
j_parts = [p for p in (l_cols_sql, r_key_cols_sql, r_non_key_sql) if p]
|
||||
j_select = ", ".join(j_parts)
|
||||
|
||||
r_key0 = rk(join_keys[0][1])
|
||||
l_key0 = lk(join_keys[0][0])
|
||||
j_sql = f"SELECT l.*, {r_cols_sql} FROM __join_left__ l INNER JOIN __join_right__ r ON {on_clause}"
|
||||
l_key0 = join_keys[0][0]
|
||||
r_key0 = join_keys[0][1]
|
||||
|
||||
j_sql = f"SELECT {j_select} FROM __join_left__ l INNER JOIN __join_right__ r ON {on_clause}"
|
||||
# Left/right unmatched keep original column names (no prefixes)
|
||||
l_sql = f'SELECT l.* FROM __join_left__ l LEFT JOIN __join_right__ r ON {on_clause} WHERE r."{r_key0}" IS NULL'
|
||||
r_sql = f'SELECT r.* FROM __join_right__ r LEFT JOIN __join_left__ l ON {on_clause} WHERE l."{l_key0}" IS NULL'
|
||||
|
||||
|
|
|
|||
Loading…
Reference in New Issue