join fix and runner update to support multiple output files

2026-06-13 08:40:01 +10:00 · 2026-06-13 08:40:01 +10:00 · dd1431760a
parent ab3d7ab971
commit dd1431760a
7 changed files with 1717 additions and 1709 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -1,3 +1,7 @@
 # Alteryx example files
 Alteryx_TestWorkflows/JoinTesting/Output/
 !Alteryx_TestWorkflows/JoinTesting/Output/**/
 # uv
 uv.lock
--- a/Alteryx_TestWorkflows/JoinTesting/Output/Join_out_J.csv
+++ b/Alteryx_TestWorkflows/JoinTesting/Output/Join_out_J.csv
--- a/Alteryx_TestWorkflows/JoinTesting/Output/Join_out_L.csv
+++ b/Alteryx_TestWorkflows/JoinTesting/Output/Join_out_L.csv
@@ -1,4 +1,4 @@
-Left_Store_ID,Product_ID,Left_Stock_On_Hand
+Store_ID,Product_ID,Stock_On_Hand
 15,31,4
 15,32,16
 15,33,8
@@ -62,6 +62,46 @@ Left_Store_ID,Product_ID,Left_Stock_On_Hand
 28,32,3
 28,33,9
 28,34,19
 42,31,11
 42,32,4
 42,33,18
 42,34,34
 42,35,13
 43,31,18
 43,32,38
 43,33,5
 43,34,7
 44,31,8
 44,32,29
 44,33,0
 44,34,22
 45,31,6
 45,32,6
 45,33,7
 45,34,3
 46,31,13
 46,32,8
 46,33,11
 46,34,24
 47,31,48
 47,32,6
 47,33,13
 47,34,3
 48,31,41
 48,32,7
 48,33,0
 48,34,39
 48,35,3
 49,31,51
 49,32,11
 49,33,15
 49,34,2
 49,35,19
 50,31,18
 50,32,9
 50,33,1
 50,34,17
 50,35,8
 29,31,3
 29,32,7
 29,33,6
@@ -184,43 +224,3 @@ Left_Store_ID,Product_ID,Left_Stock_On_Hand
 14,32,2
 14,33,2
 14,34,8
 42,31,11
 42,32,4
 42,33,18
 42,34,34
 42,35,13
 43,31,18
 43,32,38
 43,33,5
 43,34,7
 44,31,8
 44,32,29
 44,33,0
 44,34,22
 45,31,6
 45,32,6
 45,33,7
 45,34,3
 46,31,13
 46,32,8
 46,33,11
 46,34,24
 47,31,48
 47,32,6
 47,33,13
 47,34,3
 48,31,41
 48,32,7
 48,33,0
 48,34,39
 48,35,3
 49,31,51
 49,32,11
 49,33,15
 49,34,2
 49,35,19
 50,31,18
 50,32,9
 50,33,1
 50,34,17
 50,35,8
--- a/Alteryx_TestWorkflows/JoinTesting/Output/Join_out_R.csv
+++ b/Alteryx_TestWorkflows/JoinTesting/Output/Join_out_R.csv
@@ -1,2 +1,2 @@
-Product_ID,Right_Product_Name,Right_Product_Category,Right_Product_Cost,Right_Product_Price
+Product_ID,Product_Name,Product_Category,Product_Cost,Product_Price
 100,Non-product,NoCat,$1,$1
--- a/alteryx_runner/tools/inout/output_data.py
+++ b/alteryx_runner/tools/inout/output_data.py
@@ -74,7 +74,10 @@ class OutputDataTool(BaseTool):
                or "True"
            )
            header = header_val.lower() != "false"
-            df.write_csv(str(path), separator=delim, include_header=header)
+            line_end = (opts.findtext("LineEndStyle") or "LF").strip().upper()
            eol = "\r\n" if line_end == "CRLF" else "\n"
            df.write_csv(str(path), separator=delim, include_header=header,
                         line_terminator=eol)
        elif fmt == 25:     # Excel
            df.write_excel(str(path))
        elif fmt == 2:      # Parquet
--- a/alteryx_runner/tools/join/join_tool.py
+++ b/alteryx_runner/tools/join/join_tool.py
@@ -78,62 +78,68 @@ class JoinTool(BaseTool):
                if select_fields is None:
                    return df
-                # Build column mapping
+                order_changed_el = cfg.find("OrderChanged")
-                # First, collect explicitly selected fields
+                order_changed = (
-                explicit_selections = []  # list of (src_col, output_name)
+                    order_changed_el is not None
                    and order_changed_el.attrib.get("value", "False") == "True"
                )
                # Parse field rules
                rename_map: dict[str, str] = {}  # src_col → output_name
                exclude_set: set[str] = set()    # columns explicitly excluded
                explicit_order: list[str] = []   # for OrderChanged=True
                has_unknown = False
                unknown_selected = True
                for sf in select_fields.findall("SelectField"):
                    field = sf.attrib.get("field", "")
-                    selected = sf.attrib.get("selected", "False") == "True"
+                    selected = sf.attrib.get("selected", "True") == "True"
                    rename = sf.attrib.get("rename", "")
                    input_prefix = sf.attrib.get("input", "")
                    if not selected:
                        continue
                    if field == "*Unknown":
                        has_unknown = True
                        unknown_selected = selected
                        continue
                    # Resolve column name in the DataFrame
                    if field not in df.columns:
                        continue
                    if not selected:
                        exclude_set.add(field)
                    else:
-                        # Find the column with prefix
+                        explicit_order.append(field)
-                        src_col = f"{input_prefix}{field}" if input_prefix else field
+                        if rename and rename != field:
-                        if src_col in df.columns:
+                            rename_map[field] = rename
                            output_name = rename if rename else field
                            explicit_selections.append((src_col, output_name))
                        elif field in df.columns:
                            output_name = rename if rename else field
                            explicit_selections.append((field, output_name))
                # Build final column list
-                selected_cols = []
+                mentioned = set(explicit_order) | exclude_set
                rename_map = {}
-                # Add explicitly selected columns
+                if order_changed:
-                for src, dst in explicit_selections:
+                    # Explicit selections first (in specified order), then *Unknown
-                    selected_cols.append(src)
+                    final_cols = list(explicit_order)
-                    if src != dst:
+                    if has_unknown and unknown_selected:
                        rename_map[src] = dst
                # Handle *Unknown: include all remaining columns, stripping prefixes
                if has_unknown:
                    explicit_srcs = {src for src, _ in explicit_selections}
                        for col in df.columns:
-                        if col not in explicit_srcs:
+                            if col not in mentioned:
-                            # Strip Left_/Right_ prefix for output name
+                                final_cols.append(col)
-                            output_name = col
+                else:
-                            if col.startswith("Left_"):
+                    # Preserve original DataFrame column order
-                                output_name = col[5:]
+                    final_cols = []
-                            elif col.startswith("Right_"):
+                    for col in df.columns:
-                                output_name = col[6:]
+                        if col in exclude_set:
-                            selected_cols.append(col)
+                            continue
-                            if col != output_name:
+                        if col in mentioned or (has_unknown and unknown_selected):
-                                rename_map[col] = output_name
+                            final_cols.append(col)
                        elif not has_unknown and col not in mentioned:
                            # Default: include if not explicitly excluded
                            final_cols.append(col)
-                # Apply selection and renaming
+                if final_cols:
-                if selected_cols:
+                    df = df.select(final_cols)
                    df = df.select(selected_cols)
                if rename_map:
-                        df = df.rename(rename_map)
+                    df = df.rename(
                        {k: v for k, v in rename_map.items() if k in df.columns}
                    )
                break
        return df
@@ -145,45 +151,40 @@ class JoinTool(BaseTool):
    ) -> tuple[pl.DataFrame, pl.DataFrame, pl.DataFrame]:
        con = self.ctx.duckdb_con
        # Disambiguate conflicting column names
        key_l = {k[0] for k in join_keys}
        key_r = {k[1] for k in join_keys}
        l_non_key = [c for c in left.columns if c not in key_l]
        r_non_key = [c for c in right.columns if c not in key_r]
        # Only right non-key columns that clash with left columns need a prefix
        conflicts = set(l_non_key) & set(r_non_key)
-        # Prefix all non-key columns: Left_ for left, Right_ for right
+        # Register the original (un-prefixed) tables
-        # This matches Alteryx behavior where SelectConfiguration references
+        con.register("__join_left__", left.to_arrow())
-        # fields with these prefixes
+        con.register("__join_right__", right.to_arrow())
        rename_l = {c: f"Left_{c}" for c in l_non_key}
        rename_r = {c: f"Right_{c}" for c in r_non_key}
        # But keep join keys without prefix (they come from left)
        left_r = left.rename(rename_l) if rename_l else left
        right_r = right.rename(rename_r) if rename_r else right
        con.register("__join_left__", left_r.to_arrow())
        con.register("__join_right__", right_r.to_arrow())
        # Map renamed key column names
        def lk(k: str) -> str:
            return rename_l.get(k, k)
        def rk(k: str) -> str:
            return rename_r.get(k, k)
        on_clause = " AND ".join(
-            f'l."{lk(k[0])}" = r."{rk(k[1])}"' for k in join_keys
+            f'l."{k[0]}" = r."{k[1]}"' for k in join_keys
        )
-        # Include right join keys with Right_ prefix for SelectConfiguration
+        # --- Inner join SELECT ------------------------------------------------
-        r_key_cols_sql = ", ".join(f'r."{rk(k[1])}" AS "Right_{k[1]}"' for k in join_keys)
+        # Left columns first (no prefix), then right join keys with Right_
-        r_cols_sql = ", ".join(f'r."{rk(c)}"' for c in r_non_key)
+        # prefix, then right non-key columns (Right_ prefix only on conflicts).
-        if r_key_cols_sql:
+        l_cols_sql = ", ".join(f'l."{c}"' for c in left.columns)
-            r_cols_sql = f"{r_key_cols_sql}, {r_cols_sql}"
+        r_key_cols_sql = ", ".join(
            f'r."{k[1]}" AS "Right_{k[1]}"' for k in join_keys
        )
        r_non_key_sql = ", ".join(
            f'r."{c}" AS "Right_{c}"' if c in conflicts else f'r."{c}"'
            for c in r_non_key
        )
        j_parts = [p for p in (l_cols_sql, r_key_cols_sql, r_non_key_sql) if p]
        j_select = ", ".join(j_parts)
-        r_key0 = rk(join_keys[0][1])
+        l_key0 = join_keys[0][0]
-        l_key0 = lk(join_keys[0][0])
+        r_key0 = join_keys[0][1]
-        j_sql = f"SELECT l.*, {r_cols_sql} FROM __join_left__ l INNER JOIN __join_right__ r ON {on_clause}"
+
        j_sql = f"SELECT {j_select} FROM __join_left__ l INNER JOIN __join_right__ r ON {on_clause}"
        # Left/right unmatched keep original column names (no prefixes)
        l_sql = f'SELECT l.* FROM __join_left__ l LEFT JOIN __join_right__ r ON {on_clause} WHERE r."{r_key0}" IS NULL'
        r_sql = f'SELECT r.* FROM __join_right__ r LEFT JOIN __join_left__ l ON {on_clause} WHERE l."{l_key0}" IS NULL'
`@@ -1,2 +1,2 @@`
	`Product_ID,Right_Product_Name,Right_Product_Category,Right_Product_Cost,Right_Product_Price`	`Product_ID,Product_Name,Product_Category,Product_Cost,Product_Price`
	`100,Non-product,NoCat,$1,$1`	`100,Non-product,NoCat,$1,$1`