join fix and runner update to support multiple output files

2026-06-13 08:40:01 +10:00 · 2026-06-13 08:40:01 +10:00 · dd1431760a
parent ab3d7ab971
commit dd1431760a
7 changed files with 1717 additions and 1709 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -1,3 +1,7 @@
+# Alteryx example files
+Alteryx_TestWorkflows/JoinTesting/Output/
+!Alteryx_TestWorkflows/JoinTesting/Output/**/
+
 # uv
 uv.lock

--- a/Alteryx_TestWorkflows/JoinTesting/Output/Join_out_J.csv
+++ b/Alteryx_TestWorkflows/JoinTesting/Output/Join_out_J.csv
--- a/Alteryx_TestWorkflows/JoinTesting/Output/Join_out_L.csv
+++ b/Alteryx_TestWorkflows/JoinTesting/Output/Join_out_L.csv
@@ -1,4 +1,4 @@
-Left_Store_ID,Product_ID,Left_Stock_On_Hand
+Store_ID,Product_ID,Stock_On_Hand
 15,31,4
 15,32,16
 15,33,8
@@ -62,6 +62,46 @@ Left_Store_ID,Product_ID,Left_Stock_On_Hand
 28,32,3
 28,33,9
 28,34,19
+42,31,11
+42,32,4
+42,33,18
+42,34,34
+42,35,13
+43,31,18
+43,32,38
+43,33,5
+43,34,7
+44,31,8
+44,32,29
+44,33,0
+44,34,22
+45,31,6
+45,32,6
+45,33,7
+45,34,3
+46,31,13
+46,32,8
+46,33,11
+46,34,24
+47,31,48
+47,32,6
+47,33,13
+47,34,3
+48,31,41
+48,32,7
+48,33,0
+48,34,39
+48,35,3
+49,31,51
+49,32,11
+49,33,15
+49,34,2
+49,35,19
+50,31,18
+50,32,9
+50,33,1
+50,34,17
+50,35,8
 29,31,3
 29,32,7
 29,33,6
@@ -184,43 +224,3 @@ Left_Store_ID,Product_ID,Left_Stock_On_Hand
 14,32,2
 14,33,2
 14,34,8
-42,31,11
-42,32,4
-42,33,18
-42,34,34
-42,35,13
-43,31,18
-43,32,38
-43,33,5
-43,34,7
-44,31,8
-44,32,29
-44,33,0
-44,34,22
-45,31,6
-45,32,6
-45,33,7
-45,34,3
-46,31,13
-46,32,8
-46,33,11
-46,34,24
-47,31,48
-47,32,6
-47,33,13
-47,34,3
-48,31,41
-48,32,7
-48,33,0
-48,34,39
-48,35,3
-49,31,51
-49,32,11
-49,33,15
-49,34,2
-49,35,19
-50,31,18
-50,32,9
-50,33,1
-50,34,17
-50,35,8
--- a/Alteryx_TestWorkflows/JoinTesting/Output/Join_out_R.csv
+++ b/Alteryx_TestWorkflows/JoinTesting/Output/Join_out_R.csv
@@ -1,2 +1,2 @@
-Product_ID,Right_Product_Name,Right_Product_Category,Right_Product_Cost,Right_Product_Price
+Product_ID,Product_Name,Product_Category,Product_Cost,Product_Price
 100,Non-product,NoCat,$1,$1
--- a/alteryx_runner/tools/inout/output_data.py
+++ b/alteryx_runner/tools/inout/output_data.py
@@ -74,7 +74,10 @@ class OutputDataTool(BaseTool):
                or "True"
            )
            header = header_val.lower() != "false"
-            df.write_csv(str(path), separator=delim, include_header=header)
+            line_end = (opts.findtext("LineEndStyle") or "LF").strip().upper()
+            eol = "\r\n" if line_end == "CRLF" else "\n"
+            df.write_csv(str(path), separator=delim, include_header=header,
+                         line_terminator=eol)
        elif fmt == 25:     # Excel
            df.write_excel(str(path))
        elif fmt == 2:      # Parquet
--- a/alteryx_runner/tools/join/join_tool.py
+++ b/alteryx_runner/tools/join/join_tool.py
@@ -78,62 +78,68 @@ class JoinTool(BaseTool):
                if select_fields is None:
                    return df

-                # Build column mapping
-                # First, collect explicitly selected fields
-                explicit_selections = []  # list of (src_col, output_name)
+                order_changed_el = cfg.find("OrderChanged")
+                order_changed = (
+                    order_changed_el is not None
+                    and order_changed_el.attrib.get("value", "False") == "True"
+                )
+
+                # Parse field rules
+                rename_map: dict[str, str] = {}  # src_col → output_name
+                exclude_set: set[str] = set()    # columns explicitly excluded
+                explicit_order: list[str] = []   # for OrderChanged=True
                has_unknown = False
+                unknown_selected = True

                for sf in select_fields.findall("SelectField"):
                    field = sf.attrib.get("field", "")
-                    selected = sf.attrib.get("selected", "False") == "True"
+                    selected = sf.attrib.get("selected", "True") == "True"
                    rename = sf.attrib.get("rename", "")
-                    input_prefix = sf.attrib.get("input", "")
-                    
-                    if not selected:
-                        continue

                    if field == "*Unknown":
                        has_unknown = True
+                        unknown_selected = selected
+                        continue
+
+                    # Resolve column name in the DataFrame
+                    if field not in df.columns:
+                        continue
+
+                    if not selected:
+                        exclude_set.add(field)
                    else:
-                        # Find the column with prefix
-                        src_col = f"{input_prefix}{field}" if input_prefix else field
-                        if src_col in df.columns:
-                            output_name = rename if rename else field
-                            explicit_selections.append((src_col, output_name))
-                        elif field in df.columns:
-                            output_name = rename if rename else field
-                            explicit_selections.append((field, output_name))
+                        explicit_order.append(field)
+                        if rename and rename != field:
+                            rename_map[field] = rename

                # Build final column list
-                selected_cols = []
-                rename_map = {}
+                mentioned = set(explicit_order) | exclude_set

-                # Add explicitly selected columns
-                for src, dst in explicit_selections:
-                    selected_cols.append(src)
-                    if src != dst:
-                        rename_map[src] = dst
-                
-                # Handle *Unknown: include all remaining columns, stripping prefixes
-                if has_unknown:
-                    explicit_srcs = {src for src, _ in explicit_selections}
+                if order_changed:
+                    # Explicit selections first (in specified order), then *Unknown
+                    final_cols = list(explicit_order)
+                    if has_unknown and unknown_selected:
                        for col in df.columns:
-                        if col not in explicit_srcs:
-                            # Strip Left_/Right_ prefix for output name
-                            output_name = col
-                            if col.startswith("Left_"):
-                                output_name = col[5:]
-                            elif col.startswith("Right_"):
-                                output_name = col[6:]
-                            selected_cols.append(col)
-                            if col != output_name:
-                                rename_map[col] = output_name
+                            if col not in mentioned:
+                                final_cols.append(col)
+                else:
+                    # Preserve original DataFrame column order
+                    final_cols = []
+                    for col in df.columns:
+                        if col in exclude_set:
+                            continue
+                        if col in mentioned or (has_unknown and unknown_selected):
+                            final_cols.append(col)
+                        elif not has_unknown and col not in mentioned:
+                            # Default: include if not explicitly excluded
+                            final_cols.append(col)

-                # Apply selection and renaming
-                if selected_cols:
-                    df = df.select(selected_cols)
+                if final_cols:
+                    df = df.select(final_cols)
                if rename_map:
-                        df = df.rename(rename_map)
+                    df = df.rename(
+                        {k: v for k, v in rename_map.items() if k in df.columns}
+                    )
                break
        return df

@@ -145,45 +151,40 @@ class JoinTool(BaseTool):
    ) -> tuple[pl.DataFrame, pl.DataFrame, pl.DataFrame]:
        con = self.ctx.duckdb_con

-        # Disambiguate conflicting column names
        key_l = {k[0] for k in join_keys}
        key_r = {k[1] for k in join_keys}
        l_non_key = [c for c in left.columns if c not in key_l]
        r_non_key = [c for c in right.columns if c not in key_r]
+        # Only right non-key columns that clash with left columns need a prefix
        conflicts = set(l_non_key) & set(r_non_key)

-        # Prefix all non-key columns: Left_ for left, Right_ for right
-        # This matches Alteryx behavior where SelectConfiguration references
-        # fields with these prefixes
-        rename_l = {c: f"Left_{c}" for c in l_non_key}
-        rename_r = {c: f"Right_{c}" for c in r_non_key}
-        # But keep join keys without prefix (they come from left)
-        left_r = left.rename(rename_l) if rename_l else left
-        right_r = right.rename(rename_r) if rename_r else right
-
-        con.register("__join_left__", left_r.to_arrow())
-        con.register("__join_right__", right_r.to_arrow())
-
-        # Map renamed key column names
-        def lk(k: str) -> str:
-            return rename_l.get(k, k)
-
-        def rk(k: str) -> str:
-            return rename_r.get(k, k)
+        # Register the original (un-prefixed) tables
+        con.register("__join_left__", left.to_arrow())
+        con.register("__join_right__", right.to_arrow())

        on_clause = " AND ".join(
-            f'l."{lk(k[0])}" = r."{rk(k[1])}"' for k in join_keys
+            f'l."{k[0]}" = r."{k[1]}"' for k in join_keys
        )

-        # Include right join keys with Right_ prefix for SelectConfiguration
-        r_key_cols_sql = ", ".join(f'r."{rk(k[1])}" AS "Right_{k[1]}"' for k in join_keys)
-        r_cols_sql = ", ".join(f'r."{rk(c)}"' for c in r_non_key)
-        if r_key_cols_sql:
-            r_cols_sql = f"{r_key_cols_sql}, {r_cols_sql}"
+        # --- Inner join SELECT ------------------------------------------------
+        # Left columns first (no prefix), then right join keys with Right_
+        # prefix, then right non-key columns (Right_ prefix only on conflicts).
+        l_cols_sql = ", ".join(f'l."{c}"' for c in left.columns)
+        r_key_cols_sql = ", ".join(
+            f'r."{k[1]}" AS "Right_{k[1]}"' for k in join_keys
+        )
+        r_non_key_sql = ", ".join(
+            f'r."{c}" AS "Right_{c}"' if c in conflicts else f'r."{c}"'
+            for c in r_non_key
+        )
+        j_parts = [p for p in (l_cols_sql, r_key_cols_sql, r_non_key_sql) if p]
+        j_select = ", ".join(j_parts)

-        r_key0 = rk(join_keys[0][1])
-        l_key0 = lk(join_keys[0][0])
-        j_sql = f"SELECT l.*, {r_cols_sql} FROM __join_left__ l INNER JOIN __join_right__ r ON {on_clause}"
+        l_key0 = join_keys[0][0]
+        r_key0 = join_keys[0][1]
+
+        j_sql = f"SELECT {j_select} FROM __join_left__ l INNER JOIN __join_right__ r ON {on_clause}"
+        # Left/right unmatched keep original column names (no prefixes)
        l_sql = f'SELECT l.* FROM __join_left__ l LEFT JOIN __join_right__ r ON {on_clause} WHERE r."{r_key0}" IS NULL'
        r_sql = f'SELECT r.* FROM __join_right__ r LEFT JOIN __join_left__ l ON {on_clause} WHERE l."{l_key0}" IS NULL'