join fix and runner update to support multiple output files
parent
ab3d7ab971
commit
dd1431760a
|
|
@ -1,3 +1,7 @@
|
||||||
|
# Alteryx example files
|
||||||
|
Alteryx_TestWorkflows/JoinTesting/Output/
|
||||||
|
!Alteryx_TestWorkflows/JoinTesting/Output/**/
|
||||||
|
|
||||||
# uv
|
# uv
|
||||||
uv.lock
|
uv.lock
|
||||||
|
|
||||||
|
|
|
||||||
File diff suppressed because it is too large
Load Diff
|
|
@ -1,4 +1,4 @@
|
||||||
Left_Store_ID,Product_ID,Left_Stock_On_Hand
|
Store_ID,Product_ID,Stock_On_Hand
|
||||||
15,31,4
|
15,31,4
|
||||||
15,32,16
|
15,32,16
|
||||||
15,33,8
|
15,33,8
|
||||||
|
|
@ -62,6 +62,46 @@ Left_Store_ID,Product_ID,Left_Stock_On_Hand
|
||||||
28,32,3
|
28,32,3
|
||||||
28,33,9
|
28,33,9
|
||||||
28,34,19
|
28,34,19
|
||||||
|
42,31,11
|
||||||
|
42,32,4
|
||||||
|
42,33,18
|
||||||
|
42,34,34
|
||||||
|
42,35,13
|
||||||
|
43,31,18
|
||||||
|
43,32,38
|
||||||
|
43,33,5
|
||||||
|
43,34,7
|
||||||
|
44,31,8
|
||||||
|
44,32,29
|
||||||
|
44,33,0
|
||||||
|
44,34,22
|
||||||
|
45,31,6
|
||||||
|
45,32,6
|
||||||
|
45,33,7
|
||||||
|
45,34,3
|
||||||
|
46,31,13
|
||||||
|
46,32,8
|
||||||
|
46,33,11
|
||||||
|
46,34,24
|
||||||
|
47,31,48
|
||||||
|
47,32,6
|
||||||
|
47,33,13
|
||||||
|
47,34,3
|
||||||
|
48,31,41
|
||||||
|
48,32,7
|
||||||
|
48,33,0
|
||||||
|
48,34,39
|
||||||
|
48,35,3
|
||||||
|
49,31,51
|
||||||
|
49,32,11
|
||||||
|
49,33,15
|
||||||
|
49,34,2
|
||||||
|
49,35,19
|
||||||
|
50,31,18
|
||||||
|
50,32,9
|
||||||
|
50,33,1
|
||||||
|
50,34,17
|
||||||
|
50,35,8
|
||||||
29,31,3
|
29,31,3
|
||||||
29,32,7
|
29,32,7
|
||||||
29,33,6
|
29,33,6
|
||||||
|
|
@ -184,43 +224,3 @@ Left_Store_ID,Product_ID,Left_Stock_On_Hand
|
||||||
14,32,2
|
14,32,2
|
||||||
14,33,2
|
14,33,2
|
||||||
14,34,8
|
14,34,8
|
||||||
42,31,11
|
|
||||||
42,32,4
|
|
||||||
42,33,18
|
|
||||||
42,34,34
|
|
||||||
42,35,13
|
|
||||||
43,31,18
|
|
||||||
43,32,38
|
|
||||||
43,33,5
|
|
||||||
43,34,7
|
|
||||||
44,31,8
|
|
||||||
44,32,29
|
|
||||||
44,33,0
|
|
||||||
44,34,22
|
|
||||||
45,31,6
|
|
||||||
45,32,6
|
|
||||||
45,33,7
|
|
||||||
45,34,3
|
|
||||||
46,31,13
|
|
||||||
46,32,8
|
|
||||||
46,33,11
|
|
||||||
46,34,24
|
|
||||||
47,31,48
|
|
||||||
47,32,6
|
|
||||||
47,33,13
|
|
||||||
47,34,3
|
|
||||||
48,31,41
|
|
||||||
48,32,7
|
|
||||||
48,33,0
|
|
||||||
48,34,39
|
|
||||||
48,35,3
|
|
||||||
49,31,51
|
|
||||||
49,32,11
|
|
||||||
49,33,15
|
|
||||||
49,34,2
|
|
||||||
49,35,19
|
|
||||||
50,31,18
|
|
||||||
50,32,9
|
|
||||||
50,33,1
|
|
||||||
50,34,17
|
|
||||||
50,35,8
|
|
||||||
|
|
|
||||||
|
|
|
@ -1,2 +1,2 @@
|
||||||
Product_ID,Right_Product_Name,Right_Product_Category,Right_Product_Cost,Right_Product_Price
|
Product_ID,Product_Name,Product_Category,Product_Cost,Product_Price
|
||||||
100,Non-product,NoCat,$1,$1
|
100,Non-product,NoCat,$1,$1
|
||||||
|
|
|
||||||
|
|
|
@ -74,7 +74,10 @@ class OutputDataTool(BaseTool):
|
||||||
or "True"
|
or "True"
|
||||||
)
|
)
|
||||||
header = header_val.lower() != "false"
|
header = header_val.lower() != "false"
|
||||||
df.write_csv(str(path), separator=delim, include_header=header)
|
line_end = (opts.findtext("LineEndStyle") or "LF").strip().upper()
|
||||||
|
eol = "\r\n" if line_end == "CRLF" else "\n"
|
||||||
|
df.write_csv(str(path), separator=delim, include_header=header,
|
||||||
|
line_terminator=eol)
|
||||||
elif fmt == 25: # Excel
|
elif fmt == 25: # Excel
|
||||||
df.write_excel(str(path))
|
df.write_excel(str(path))
|
||||||
elif fmt == 2: # Parquet
|
elif fmt == 2: # Parquet
|
||||||
|
|
|
||||||
|
|
@ -78,62 +78,68 @@ class JoinTool(BaseTool):
|
||||||
if select_fields is None:
|
if select_fields is None:
|
||||||
return df
|
return df
|
||||||
|
|
||||||
# Build column mapping
|
order_changed_el = cfg.find("OrderChanged")
|
||||||
# First, collect explicitly selected fields
|
order_changed = (
|
||||||
explicit_selections = [] # list of (src_col, output_name)
|
order_changed_el is not None
|
||||||
|
and order_changed_el.attrib.get("value", "False") == "True"
|
||||||
|
)
|
||||||
|
|
||||||
|
# Parse field rules
|
||||||
|
rename_map: dict[str, str] = {} # src_col → output_name
|
||||||
|
exclude_set: set[str] = set() # columns explicitly excluded
|
||||||
|
explicit_order: list[str] = [] # for OrderChanged=True
|
||||||
has_unknown = False
|
has_unknown = False
|
||||||
|
unknown_selected = True
|
||||||
|
|
||||||
for sf in select_fields.findall("SelectField"):
|
for sf in select_fields.findall("SelectField"):
|
||||||
field = sf.attrib.get("field", "")
|
field = sf.attrib.get("field", "")
|
||||||
selected = sf.attrib.get("selected", "False") == "True"
|
selected = sf.attrib.get("selected", "True") == "True"
|
||||||
rename = sf.attrib.get("rename", "")
|
rename = sf.attrib.get("rename", "")
|
||||||
input_prefix = sf.attrib.get("input", "")
|
|
||||||
|
|
||||||
if not selected:
|
|
||||||
continue
|
|
||||||
|
|
||||||
if field == "*Unknown":
|
if field == "*Unknown":
|
||||||
has_unknown = True
|
has_unknown = True
|
||||||
|
unknown_selected = selected
|
||||||
|
continue
|
||||||
|
|
||||||
|
# Resolve column name in the DataFrame
|
||||||
|
if field not in df.columns:
|
||||||
|
continue
|
||||||
|
|
||||||
|
if not selected:
|
||||||
|
exclude_set.add(field)
|
||||||
else:
|
else:
|
||||||
# Find the column with prefix
|
explicit_order.append(field)
|
||||||
src_col = f"{input_prefix}{field}" if input_prefix else field
|
if rename and rename != field:
|
||||||
if src_col in df.columns:
|
rename_map[field] = rename
|
||||||
output_name = rename if rename else field
|
|
||||||
explicit_selections.append((src_col, output_name))
|
|
||||||
elif field in df.columns:
|
|
||||||
output_name = rename if rename else field
|
|
||||||
explicit_selections.append((field, output_name))
|
|
||||||
|
|
||||||
# Build final column list
|
# Build final column list
|
||||||
selected_cols = []
|
mentioned = set(explicit_order) | exclude_set
|
||||||
rename_map = {}
|
|
||||||
|
|
||||||
# Add explicitly selected columns
|
if order_changed:
|
||||||
for src, dst in explicit_selections:
|
# Explicit selections first (in specified order), then *Unknown
|
||||||
selected_cols.append(src)
|
final_cols = list(explicit_order)
|
||||||
if src != dst:
|
if has_unknown and unknown_selected:
|
||||||
rename_map[src] = dst
|
|
||||||
|
|
||||||
# Handle *Unknown: include all remaining columns, stripping prefixes
|
|
||||||
if has_unknown:
|
|
||||||
explicit_srcs = {src for src, _ in explicit_selections}
|
|
||||||
for col in df.columns:
|
for col in df.columns:
|
||||||
if col not in explicit_srcs:
|
if col not in mentioned:
|
||||||
# Strip Left_/Right_ prefix for output name
|
final_cols.append(col)
|
||||||
output_name = col
|
else:
|
||||||
if col.startswith("Left_"):
|
# Preserve original DataFrame column order
|
||||||
output_name = col[5:]
|
final_cols = []
|
||||||
elif col.startswith("Right_"):
|
for col in df.columns:
|
||||||
output_name = col[6:]
|
if col in exclude_set:
|
||||||
selected_cols.append(col)
|
continue
|
||||||
if col != output_name:
|
if col in mentioned or (has_unknown and unknown_selected):
|
||||||
rename_map[col] = output_name
|
final_cols.append(col)
|
||||||
|
elif not has_unknown and col not in mentioned:
|
||||||
|
# Default: include if not explicitly excluded
|
||||||
|
final_cols.append(col)
|
||||||
|
|
||||||
# Apply selection and renaming
|
if final_cols:
|
||||||
if selected_cols:
|
df = df.select(final_cols)
|
||||||
df = df.select(selected_cols)
|
|
||||||
if rename_map:
|
if rename_map:
|
||||||
df = df.rename(rename_map)
|
df = df.rename(
|
||||||
|
{k: v for k, v in rename_map.items() if k in df.columns}
|
||||||
|
)
|
||||||
break
|
break
|
||||||
return df
|
return df
|
||||||
|
|
||||||
|
|
@ -145,45 +151,40 @@ class JoinTool(BaseTool):
|
||||||
) -> tuple[pl.DataFrame, pl.DataFrame, pl.DataFrame]:
|
) -> tuple[pl.DataFrame, pl.DataFrame, pl.DataFrame]:
|
||||||
con = self.ctx.duckdb_con
|
con = self.ctx.duckdb_con
|
||||||
|
|
||||||
# Disambiguate conflicting column names
|
|
||||||
key_l = {k[0] for k in join_keys}
|
key_l = {k[0] for k in join_keys}
|
||||||
key_r = {k[1] for k in join_keys}
|
key_r = {k[1] for k in join_keys}
|
||||||
l_non_key = [c for c in left.columns if c not in key_l]
|
l_non_key = [c for c in left.columns if c not in key_l]
|
||||||
r_non_key = [c for c in right.columns if c not in key_r]
|
r_non_key = [c for c in right.columns if c not in key_r]
|
||||||
|
# Only right non-key columns that clash with left columns need a prefix
|
||||||
conflicts = set(l_non_key) & set(r_non_key)
|
conflicts = set(l_non_key) & set(r_non_key)
|
||||||
|
|
||||||
# Prefix all non-key columns: Left_ for left, Right_ for right
|
# Register the original (un-prefixed) tables
|
||||||
# This matches Alteryx behavior where SelectConfiguration references
|
con.register("__join_left__", left.to_arrow())
|
||||||
# fields with these prefixes
|
con.register("__join_right__", right.to_arrow())
|
||||||
rename_l = {c: f"Left_{c}" for c in l_non_key}
|
|
||||||
rename_r = {c: f"Right_{c}" for c in r_non_key}
|
|
||||||
# But keep join keys without prefix (they come from left)
|
|
||||||
left_r = left.rename(rename_l) if rename_l else left
|
|
||||||
right_r = right.rename(rename_r) if rename_r else right
|
|
||||||
|
|
||||||
con.register("__join_left__", left_r.to_arrow())
|
|
||||||
con.register("__join_right__", right_r.to_arrow())
|
|
||||||
|
|
||||||
# Map renamed key column names
|
|
||||||
def lk(k: str) -> str:
|
|
||||||
return rename_l.get(k, k)
|
|
||||||
|
|
||||||
def rk(k: str) -> str:
|
|
||||||
return rename_r.get(k, k)
|
|
||||||
|
|
||||||
on_clause = " AND ".join(
|
on_clause = " AND ".join(
|
||||||
f'l."{lk(k[0])}" = r."{rk(k[1])}"' for k in join_keys
|
f'l."{k[0]}" = r."{k[1]}"' for k in join_keys
|
||||||
)
|
)
|
||||||
|
|
||||||
# Include right join keys with Right_ prefix for SelectConfiguration
|
# --- Inner join SELECT ------------------------------------------------
|
||||||
r_key_cols_sql = ", ".join(f'r."{rk(k[1])}" AS "Right_{k[1]}"' for k in join_keys)
|
# Left columns first (no prefix), then right join keys with Right_
|
||||||
r_cols_sql = ", ".join(f'r."{rk(c)}"' for c in r_non_key)
|
# prefix, then right non-key columns (Right_ prefix only on conflicts).
|
||||||
if r_key_cols_sql:
|
l_cols_sql = ", ".join(f'l."{c}"' for c in left.columns)
|
||||||
r_cols_sql = f"{r_key_cols_sql}, {r_cols_sql}"
|
r_key_cols_sql = ", ".join(
|
||||||
|
f'r."{k[1]}" AS "Right_{k[1]}"' for k in join_keys
|
||||||
|
)
|
||||||
|
r_non_key_sql = ", ".join(
|
||||||
|
f'r."{c}" AS "Right_{c}"' if c in conflicts else f'r."{c}"'
|
||||||
|
for c in r_non_key
|
||||||
|
)
|
||||||
|
j_parts = [p for p in (l_cols_sql, r_key_cols_sql, r_non_key_sql) if p]
|
||||||
|
j_select = ", ".join(j_parts)
|
||||||
|
|
||||||
r_key0 = rk(join_keys[0][1])
|
l_key0 = join_keys[0][0]
|
||||||
l_key0 = lk(join_keys[0][0])
|
r_key0 = join_keys[0][1]
|
||||||
j_sql = f"SELECT l.*, {r_cols_sql} FROM __join_left__ l INNER JOIN __join_right__ r ON {on_clause}"
|
|
||||||
|
j_sql = f"SELECT {j_select} FROM __join_left__ l INNER JOIN __join_right__ r ON {on_clause}"
|
||||||
|
# Left/right unmatched keep original column names (no prefixes)
|
||||||
l_sql = f'SELECT l.* FROM __join_left__ l LEFT JOIN __join_right__ r ON {on_clause} WHERE r."{r_key0}" IS NULL'
|
l_sql = f'SELECT l.* FROM __join_left__ l LEFT JOIN __join_right__ r ON {on_clause} WHERE r."{r_key0}" IS NULL'
|
||||||
r_sql = f'SELECT r.* FROM __join_right__ r LEFT JOIN __join_left__ l ON {on_clause} WHERE l."{l_key0}" IS NULL'
|
r_sql = f'SELECT r.* FROM __join_right__ r LEFT JOIN __join_left__ l ON {on_clause} WHERE l."{l_key0}" IS NULL'
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue