{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Setup" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "# Open Alteryx XML into a string \n", "import polars as pl \n", "import xml.etree.ElementTree as ET\n", "\n", "xml_file_path = \"./SimpleWorkflow/SimpleWorkflow.yxmd\"\n", "tree = ET.parse(xml_file_path)\n", "root = tree.getroot()\n", "\n", "xml_string = ET.tostring(root, encoding='unicode')\n" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "None\n", "None\n", "None\n", "None\n", "None\n", "None\n", "None\n", "None\n", "None\n", "None\n", "None\n", "None\n", "None\n", "('1', '\\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n 1\\n 4\\n \\n \\n 2\\n 5\\n \\n \\n 3\\n 6\\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n ')\n", "('2', '\\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n ')\n", "('3', '\\n \\n \\n \\n \\n \\n C:\\\\Users\\\\casey.morter\\\\AppData\\\\Local\\\\Temp\\\\Engine_24220_fcf44807b57fab4cb64cc2e4b0cb337e_\\\\Engine_24220_f76621528201864d89236be538ce397e~.yxdb\\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n ')\n", "('4', '\\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n 5\\n 8\\n z\\n a\\n \\n \\n 6\\n 9\\n b\\n d\\n \\n \\n 7\\n 10\\n c\\n c\\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n ')\n", "('5', '\\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n ')\n", "('6', '\\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n MoreCount = [Count] + 10\\n\\n \\n \\n \\n \\n \\n ')\n", "('7', '\\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n ')\n", "('10', '\\n \\n \\n \\n \\n \\n C:\\\\Users\\\\casey.morter\\\\AppData\\\\Local\\\\Temp\\\\Engine_24220_fcf44807b57fab4cb64cc2e4b0cb337e_\\\\Engine_24220_f4c1fee3627dfb489419190371153cab~.yxdb\\n \\n \\n \\n \\n
\\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n ')\n", "('11', '\\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n Path = [Engine.TempFilePath]\\n\\n \\n \\n \\n \\n \\n ')\n", "('12', '\\n \\n \\n \\n \\n \\n First\\n 1\\n \\n \\n \\n \\n First 1\\n \\n \\n \\n \\n \\n ')\n", "('13', '\\n \\n \\n \\n \\n \\n C:\\\\Users\\\\casey.morter\\\\AppData\\\\Local\\\\Temp\\\\Engine_24220_fcf44807b57fab4cb64cc2e4b0cb337e_\\\\Engine_24220_786c4b26fed826499b55ead59d943d58~.yxdb\\n \\n \\n \\n \\n
\\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n ')\n", "('14', '\\n \\n \\n \\n \\n \\n .\\\\SimpleWorkflowOut.csv\\n \\n \\n CRLF\\n ,\\n False\\n True\\n 28591\\n True\\n \\n \\n \\n \\n \\n SimpleWorkflowOut.csv\\n \\n \\n \\n \\n \\n \\n \\n \\n ')\n", "('15', '\\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n a\\n a\\n 4\\n \\n \\n b\\n d\\n 6\\n \\n \\n c\\n z\\n 7\\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n ')\n" ] } ], "source": [ "# Parse out nodes (tool data) into a dict\n", "def extract_tool_id_and_contents(xml_string):\n", " root = ET.fromstring(xml_string)\n", " results = []\n", " for node in root.iter('Node'):\n", " tool_type = node.attrib.get('.//GuiSettings')\n", " print(tool_type)\n", " tool_id = node.attrib.get('ToolID')\n", " content = ET.tostring(node, encoding='unicode')\n", " results.append((tool_id, content))\n", " return results\n", "\n", "\n", "results = extract_tool_id_and_contents(xml_string)\n", "\n", "\n", "for tool in results:\n", " print(tool)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Tool Functions" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "def tool_select(col_spec: dict):\n", " \"\"\" Generates select tool code\"\"\"\n", " dynamic_code = \"df_output = df.with_columns(\\n\"\n", " dynamic_code_suffix = ''\n", " for old_name, (new_name, type, selected) in col_spec.items():\n", "\n", " if old_name == '*Unknown':\n", " break \n", " \n", " if new_name:\n", " alias = f\".alias('{new_name}')\"\n", " dynamic_code_suffix += f\"df_output = df_output.drop(f'{old_name}')\\n\"\n", " else:\n", " alias = ''\n", "\n", " if type is not None:\n", " if 'Int' in type:\n", " cast = f\".cast(pl.{pl.Int64})\"\n", " elif 'String' in type:\n", " cast = f\".cast(pl.{pl.String})\"\n", " else:\n", " cast = ''\n", "\n", " if selected != 'False':\n", " dynamic_code += f\"df.select(pl.col(f'{old_name}'){cast}{alias}),\\n\"\n", " else:\n", " dynamic_code_suffix += f\"df_output = df_output.drop(f'{old_name}')\\n\"\n", "\n", " dynamic_code += \")\\n\" + dynamic_code_suffix\n", " \n", " return dynamic_code" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "ename": "IndentationError", "evalue": "expected an indented block after function definition on line 1 (2015830356.py, line 5)", "output_type": "error", "traceback": [ "\u001b[1;36m Cell \u001b[1;32mIn[6], line 5\u001b[1;36m\u001b[0m\n\u001b[1;33m xml_join_tool = results[6][1]\u001b[0m\n\u001b[1;37m ^\u001b[0m\n\u001b[1;31mIndentationError\u001b[0m\u001b[1;31m:\u001b[0m expected an indented block after function definition on line 1\n" ] } ], "source": [ "def tool_join(join_spec: dict, df_L: pl.DataFrame, df_R: pl.DataFrame):\n", " \n", "\n", "\n", "xml_join_tool = results[6][1]\n", "join_spec = getConf_Join(xml_join_tool)\n", "\n", "print(join_spec)\n", "\n", "print(tool_join(join_spec))\n" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "L in: shape: (3, 3)\n", "┌─────┬─────┬─────┐\n", "│ foo ┆ bar ┆ ham │\n", "│ --- ┆ --- ┆ --- │\n", "│ i64 ┆ f64 ┆ str │\n", "╞═════╪═════╪═════╡\n", "│ 1 ┆ 6.0 ┆ a │\n", "│ 2 ┆ 7.0 ┆ b │\n", "│ 3 ┆ 8.0 ┆ c │\n", "└─────┴─────┴─────┘\n", "R in: shape: (3, 2)\n", "┌───────┬─────┐\n", "│ apple ┆ ham │\n", "│ --- ┆ --- │\n", "│ str ┆ str │\n", "╞═══════╪═════╡\n", "│ x ┆ a │\n", "│ y ┆ b │\n", "│ z ┆ d │\n", "└───────┴─────┘\n", "L: shape: (1, 3)\n", "┌─────┬─────┬─────┐\n", "│ foo ┆ bar ┆ ham │\n", "│ --- ┆ --- ┆ --- │\n", "│ i64 ┆ f64 ┆ str │\n", "╞═════╪═════╪═════╡\n", "│ 3 ┆ 8.0 ┆ c │\n", "└─────┴─────┴─────┘\n" ] } ], "source": [ "df = pl.DataFrame(\n", " {\n", " \"foo\": [1, 2, 3],\n", " \"bar\": [6.0, 7.0, 8.0],\n", " \"ham\": [\"a\", \"b\", \"c\"],\n", " }\n", ")\n", "\n", "other_df = pl.DataFrame(\n", " {\n", " \"apple\": [\"x\", \"y\", \"z\"],\n", " \"ham\": [\"a\", \"b\", \"d\"],\n", " }\n", ")\n", "\n", "print(\"L in:\", df)\n", "\n", "print(\"R in:\", other_df)\n", "\n", "print(\"L:\", df.join(other_df, left_on=\"ham\", right_on=\"ham\", how='anti'))\n", "\n", "# print(\"J: \", df.join(other_df, on=\"ham\", how='inner'))\n", "\n", "# print(\"R:\", other_df.join(df, on=\"ham\", how='anti'))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Parsing Tool Config data" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [], "source": [ "def input_textInput(xml_string):\n", " # Get XML for a Text input tool\n", " root = ET.fromstring(xml_string)\n", " # Extract the field names\n", " fields = [field.attrib['name'] for field in root.findall(\".//Fields/Field\")]\n", " # Extract the data rows\n", " data_rows = [[c.text for c in row.findall(\"c\")] for row in root.findall(\".//Data/r\")]\n", " # Create the polars dataframe\n", " df = pl.DataFrame(data_rows, fields, orient=\"row\")\n", " # Display the dataframe\n", " return df" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [], "source": [ "def getConf_Select(xml_string):\n", " root = ET.fromstring(xml_string)\n", " dict_SelectTool = {}\n", "\n", " for field in root.findall(\".//SelectFields/SelectField\"):\n", " field_name = field.attrib['field']\n", " field_selected = field.attrib['selected']\n", "\n", " try:\n", " field_type = field.attrib['type']\n", " except:\n", " field_type = None\n", " try:\n", " field_rename = field.attrib['rename']\n", " except:\n", " field_rename = None\n", "\n", " dict_SelectTool[field_name] = (field_rename, field_type, field_selected)\n", "\n", " return dict_SelectTool" ] }, { "cell_type": "code", "execution_count": 32, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "{'joinParams': ['Ham', 'Cheese'], 'fields': {'Right_Ham': ['Right_Ham', 'Right_', None, None], 'Right_Cheese': ['Right_Cheese', 'Right_', None, None], 'Right_Column 3': ['Right_Column 3', 'Right_', 'V_String', '11'], 'Right_Column 4': ['Right_Column 4', 'Right_', 'Int64', '8']}}\n" ] } ], "source": [ "def getConf_Join(xml_string):\n", " root = ET.fromstring(xml_string)\n", " dict_JoinTool = {}\n", "\n", " # Join parameters\n", " for joinField in root.findall(\".//Configuration/JoinInfo\"):\n", " if joinField.attrib['connection'] == \"Left\":\n", " left_on = [field.attrib['field'] for field in joinField]\n", " if joinField.attrib['connection'] == \"Right\":\n", " right_on = [field.attrib['field'] for field in joinField]\n", "\n", " if left_on == right_on:\n", " dict_JoinTool['joinParams'] = (left_on)\n", " else:\n", " dict_JoinTool['joinParams'] = (left_on, right_on)\n", "\n", " \n", " # Field Parameters\n", " fieldConfig = {}\n", " for fields in root.findall(\".//SelectField\"):\n", " result = []\n", " if fields.attrib['selected'] == 'True':\n", " for field in ['rename', 'input', 'type', 'size']:\n", " try:\n", " result.append(fields.attrib[field])\n", " except KeyError:\n", " result.append(None)\n", "\n", " if fields.attrib['field'] != '*Unknown':\n", " fieldConfig[fields.attrib['field']] = result\n", "\n", "\n", " dict_JoinTool['fields'] = fieldConfig\n", "\n", " return dict_JoinTool\n", "\n", "print(getConf_Join(xml_join_tool))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Working with the XML file" ] }, { "cell_type": "code", "execution_count": 89, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "1 TextInput\n", "shape: (3, 2)\n", "┌─────────┬──────────┐\n", "│ Column1 ┆ Column 2 │\n", "│ --- ┆ --- │\n", "│ str ┆ str │\n", "╞═════════╪══════════╡\n", "│ 1 ┆ 4 │\n", "│ 2 ┆ 5 │\n", "│ 3 ┆ 6 │\n", "└─────────┴──────────┘\n", "2 AlteryxSelect\n", "3 BrowseV2\n", "4 TextInput\n", "shape: (3, 4)\n", "┌──────────┬──────────┬─────┬────────┐\n", "│ Column 3 ┆ Column 4 ┆ Ham ┆ Cheese │\n", "│ --- ┆ --- ┆ --- ┆ --- │\n", "│ str ┆ str ┆ str ┆ str │\n", "╞══════════╪══════════╪═════╪════════╡\n", "│ 5 ┆ 8 ┆ z ┆ a │\n", "│ 6 ┆ 9 ┆ b ┆ d │\n", "│ 7 ┆ 10 ┆ c ┆ c │\n", "└──────────┴──────────┴─────┴────────┘\n", "5 AlteryxSelect\n", "6 Formula\n", "7 Join\n", "10 BrowseV2\n", "11 Formula\n", "12 Sample\n", "13 BrowseV2\n", "14 DbFileOutput\n", "15 TextInput\n", "shape: (3, 3)\n", "┌─────┬────────┬───────┐\n", "│ Ham ┆ Cheese ┆ Count │\n", "│ --- ┆ --- ┆ --- │\n", "│ str ┆ str ┆ str │\n", "╞═════╪════════╪═══════╡\n", "│ a ┆ a ┆ 4 │\n", "│ b ┆ d ┆ 6 │\n", "│ c ┆ z ┆ 7 │\n", "└─────┴────────┴───────┘\n" ] } ], "source": [ "# Parse all tools in tools dict\n", "for tool in results:\n", "\n", " ToolID = tool[0]\n", " ToolXML = tool[1]\n", " ToolType = ET.fromstring(ToolXML).find(\".//GuiSettings\").attrib['Plugin'].split(\".\")[2]\n", "\n", " print(ToolID, ToolType)\n", "\n", " if ToolType == 'TextInput':\n", " print(input_textInput(ToolXML))" ] }, { "cell_type": "code", "execution_count": 135, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "############### Input dataframe (TextInput):\n", " shape: (3, 4)\n", "┌──────────┬──────────┬─────┬────────┐\n", "│ Column 3 ┆ Column 4 ┆ Ham ┆ Cheese │\n", "│ --- ┆ --- ┆ --- ┆ --- │\n", "│ str ┆ str ┆ str ┆ str │\n", "╞══════════╪══════════╪═════╪════════╡\n", "│ 5 ┆ 8 ┆ z ┆ a │\n", "│ 6 ┆ 9 ┆ b ┆ d │\n", "│ 7 ┆ 10 ┆ c ┆ c │\n", "└──────────┴──────────┴─────┴────────┘\n", "\n", "############### Generated code from Select tool: \n", " df_output = df.with_columns(\n", "df.select(pl.col(f'Column 3').alias('Col_3_renamed')),\n", ")\n", "df_output = df_output.drop(f'Column 3')\n", "df_output = df_output.drop(f'Column 4')\n", "\n", "\n", "############### Output DF: \n", "\n", " shape: (3, 3)\n", "┌─────┬────────┬───────────────┐\n", "│ Ham ┆ Cheese ┆ Col_3_renamed │\n", "│ --- ┆ --- ┆ --- │\n", "│ str ┆ str ┆ str │\n", "╞═════╪════════╪═══════════════╡\n", "│ z ┆ a ┆ 5 │\n", "│ b ┆ d ┆ 6 │\n", "│ c ┆ c ┆ 7 │\n", "└─────┴────────┴───────────────┘\n" ] } ], "source": [ "# Tool 1: TextInput\n", "xml_tool1 = results[3][1]\n", "df_in = input_textInput(xml_tool1)\n", "print(\"\\n############### Input dataframe (TextInput):\\n\", df_in)\n", "# Tool 2: Select\n", "col_spec = getConf_Select(results[4][1])\n", "# Generate code from \n", "code = tool_select(col_spec)\n", "print(\"\\n############### Generated code from Select tool: \\n\", code)\n", "# Execute and display output\n", "df = df_in\n", "exec(code)\n", "print(\"\\n############### Output DF: \\n\\n\", df_output)" ] }, { "cell_type": "code", "execution_count": 92, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "('1', '\\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n 1\\n 4\\n \\n \\n 2\\n 5\\n \\n \\n 3\\n 6\\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n ')\n", "('2', '\\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n ')\n", "('3', '\\n \\n \\n \\n \\n \\n C:\\\\Users\\\\Casey\\\\AppData\\\\Local\\\\Temp\\\\Engine_4072_8b6c0740e308d445ab856d90eb0e4ee9_\\\\Engine_4072_93f5c2999bcb8c478c002b15166e2b45~.yxdb\\n \\n \\n \\n \\n
\\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n ')\n", "('4', '\\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n 5\\n 8\\n z\\n a\\n \\n \\n 6\\n 9\\n b\\n d\\n \\n \\n 7\\n 10\\n c\\n c\\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n ')\n", "('5', '\\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n ')\n", "('6', '\\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n MoreCount = [Count] + 10\\n\\n \\n \\n \\n \\n \\n ')\n", "('7', '\\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n ')\n", "('10', '\\n \\n \\n \\n \\n \\n C:\\\\Users\\\\Casey\\\\AppData\\\\Local\\\\Temp\\\\Engine_4072_8b6c0740e308d445ab856d90eb0e4ee9_\\\\Engine_4072_ab9b90f8fc991440905d667f1d5c7325~.yxdb\\n \\n \\n \\n \\n
\\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n ')\n", "('11', '\\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n Path = [Engine.TempFilePath]\\n\\n \\n \\n \\n \\n \\n ')\n", "('12', '\\n \\n \\n \\n \\n \\n First\\n 1\\n \\n \\n \\n \\n First 1\\n \\n \\n \\n \\n \\n ')\n", "('13', '\\n \\n \\n \\n \\n \\n C:\\\\Users\\\\Casey\\\\AppData\\\\Local\\\\Temp\\\\Engine_4072_8b6c0740e308d445ab856d90eb0e4ee9_\\\\Engine_4072_dd72ded80941104b9b9be56761379cb2~.yxdb\\n \\n \\n \\n \\n
\\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n ')\n", "('14', '\\n \\n \\n \\n \\n \\n .\\\\SimpleWorkflowOut.csv\\n \\n False\\n \\n CRLF\\n ,\\n False\\n True\\n 28591\\n True\\n \\n \\n \\n \\n \\n SimpleWorkflowOut.csv\\n \\n \\n \\n \\n \\n ')\n", "('15', '\\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n a\\n a\\n 4\\n \\n \\n b\\n d\\n 6\\n \\n \\n c\\n z\\n 7\\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n ')\n" ] } ], "source": [ "for tool in results:\n", " print(tool)" ] } ], "metadata": { "kernelspec": { "display_name": "polaryx", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.12.4" } }, "nbformat": 4, "nbformat_minor": 2 }