# Parse out nodes (tool data) into a list of (ToolID, xml) pairs
def extract_tool_id_and_contents(xml_string):
    """Split a workflow XML document into per-tool XML snippets.

    Args:
        xml_string: The whole workflow document as XML text.

    Returns:
        A list with one ``(tool_id, node_xml)`` tuple per ``<Node>`` element
        found anywhere in the document, in document order. ``tool_id`` is the
        node's ``ToolID`` attribute (``None`` when the attribute is absent)
        and ``node_xml`` is the node serialised back to a unicode string.
    """
    root = ET.fromstring(xml_string)
    return [
        (node.attrib.get('ToolID'), ET.tostring(node, encoding='unicode'))
        for node in root.iter('Node')
    ]


def tool_select(col_spec: dict):
    """Generate polars source code equivalent to a Select tool.

    Args:
        col_spec: Mapping ``old_name -> (new_name, type, selected)`` as
            produced by ``getConf_Select``. ``new_name`` and ``type`` may be
            ``None``; ``selected`` is the string ``'True'`` or ``'False'``.

    Returns:
        Python source (str) that reads a DataFrame named ``df`` and assigns
        the result to ``df_output``. The code references ``pl`` and must be
        executed in a namespace where polars is imported under that name.

    Notes:
        * Any type containing ``'Int'`` is cast to ``pl.Int64`` and any type
          containing ``'String'`` to ``pl.String``; other types are not cast.
        * A renamed column is emitted as an aliased expression and the
          original column is dropped afterwards.
        * Column names are embedded with ``repr()`` so quotes/braces in a
          name cannot break the generated source.
    """
    kept_exprs = []
    dropped = []

    for old_name, (new_name, dtype, selected) in col_spec.items():
        # '*Unknown' is a placeholder entry, not a real column (presumably the
        # catch-all for unlisted fields - TODO confirm). Skip it instead of
        # aborting so fields listed after it are still processed.
        if old_name == '*Unknown':
            continue

        # Unselected columns are dropped exactly once, renamed or not.
        if selected == 'False':
            dropped.append(old_name)
            continue

        # Reset per field so a missing type never reuses a previous cast.
        cast = ''
        if dtype is not None:
            if 'Int' in dtype:
                cast = ".cast(pl.Int64)"
            elif 'String' in dtype:
                cast = ".cast(pl.String)"

        alias = ''
        if new_name and new_name != old_name:
            alias = f".alias({new_name!r})"
            # The alias adds a new column; remove the original afterwards.
            dropped.append(old_name)

        # with_columns() takes expressions, so emit pl.col(...) directly.
        kept_exprs.append(f"    pl.col({old_name!r}){cast}{alias},\n")

    dynamic_code = "df_output = df.with_columns(\n" + "".join(kept_exprs) + ")\n"
    dynamic_code += "".join(
        f"df_output = df_output.drop({name!r})\n" for name in dropped
    )
    return dynamic_code


def _is_int_text(text):
    """Return True when ``text`` is missing (None) or parses with ``int()``."""
    if text is None:
        return True
    try:
        int(text)
    except ValueError:
        return False
    return True


def input_textInput(xml_string):
    """Build a polars DataFrame from the XML of a Text Input tool.

    Column names come from ``.//Fields/Field/@name`` and rows from the
    ``<c>`` cells of every ``.//Data/r`` element.

    A column is converted to integers only when every non-empty cell in it
    parses as an integer; otherwise its cell text is kept as-is. Empty cells
    (``<c/>``) become nulls instead of raising.

    Args:
        xml_string: XML of a single Text Input ``<Node>``.

    Returns:
        pl.DataFrame built row-wise from the embedded data.
    """
    root = ET.fromstring(xml_string)
    fields = [field.attrib['name'] for field in root.findall(".//Fields/Field")]
    raw_rows = [[c.text for c in row.findall("c")] for row in root.findall(".//Data/r")]

    # Decide the conversion per column so one text cell cannot produce a
    # column with mixed int/str values.
    width = max((len(row) for row in raw_rows), default=0)
    int_columns = {
        idx
        for idx in range(width)
        if all(_is_int_text(row[idx]) for row in raw_rows if idx < len(row))
    }
    data_rows = [
        [
            int(text) if idx in int_columns and text is not None else text
            for idx, text in enumerate(row)
        ]
        for row in raw_rows
    ]

    return pl.DataFrame(data_rows, fields, orient="row")


def getConf_Select(xml_string):
    """Read the field configuration of a Select tool.

    Args:
        xml_string: XML of a single Select ``<Node>``.

    Returns:
        dict mapping each ``SelectField``'s ``field`` attribute to a tuple
        ``(rename, type, selected)``. ``rename`` and ``type`` are ``None``
        when the attribute is absent.

    Raises:
        KeyError: if a ``SelectField`` lacks ``field`` or ``selected``.
    """
    root = ET.fromstring(xml_string)
    return {
        field.attrib['field']: (
            field.attrib.get('rename'),
            field.attrib.get('type'),
            field.attrib['selected'],
        )
        for field in root.findall(".//SelectFields/SelectField")
    }


### WIP
def getConf_Join(xml_string):
    """Read the join keys of a Join tool.

    Args:
        xml_string: XML of a single Join ``<Node>``.

    Returns:
        dict with:
          * ``'joinByRecordPos'``: raw attribute value from
            ``Properties/Configuration`` (only when present);
          * ``'joinParams'``: ``('on', name)`` when the Left and Right key
            share a name, else ``(left_name, right_name)``. Only set when
            both sides were found. Only the first ``<Field>`` of each
            ``JoinInfo`` is read.

    Note:
        A left key literally named ``'on'`` cannot be told apart from the
        shared-name form; callers should be aware of that ambiguity.
    """
    root = ET.fromstring(xml_string)
    dict_JoinTool = {}

    config = root.find(".//Properties/Configuration")
    if config is not None and 'joinByRecordPos' in config.attrib:
        dict_JoinTool['joinByRecordPos'] = config.attrib['joinByRecordPos']

    # Join parameters: remember the key field seen for each connection side.
    key_by_side = {}
    for join_info in root.findall(".//Configuration/JoinInfo"):
        field = join_info.find('Field')
        if field is not None:
            key_by_side[join_info.attrib.get('connection')] = field.attrib.get('field')

    left_on = key_by_side.get('Left')
    right_on = key_by_side.get('Right')
    if left_on is not None and right_on is not None:
        if left_on == right_on:
            dict_JoinTool['joinParams'] = ('on', left_on)
        else:
            dict_JoinTool['joinParams'] = (left_on, right_on)

    # TODO: Select parameters (.//Configuration/SelectConfiguration) are not
    # parsed yet.

    return dict_JoinTool
# List every tool in the workflow and preview the data of Text Input tools.
# `results` holds the (ToolID, node XML) pairs built in the setup section.
for ToolID, ToolXML in results:

    # Not every node is guaranteed to carry <GuiSettings Plugin="...">, so
    # look it up defensively instead of chaining .find(...).attrib[...].
    gui_settings = ET.fromstring(ToolXML).find(".//GuiSettings")
    plugin = gui_settings.attrib.get('Plugin', '') if gui_settings is not None else ''

    # Plugin names seen in this workflow have three dot-separated parts and
    # the third one is the tool type; fall back to the last part for shorter
    # names rather than raising IndexError.
    plugin_parts = plugin.split(".")
    if not plugin:
        ToolType = 'Unknown'
    elif len(plugin_parts) > 2:
        ToolType = plugin_parts[2]
    else:
        ToolType = plugin_parts[-1]

    print(ToolID, ToolType)

    if ToolType == 'TextInput':
        print(input_textInput(ToolXML))
# Tool 1: TextInput
# NOTE: the positional indices into `results` match the sample workflow only.
xml_tool1 = results[3][1]
# Use the Text Input node selected above and the parser defined in this
# notebook (`input_textInput`); do not rely on a loop variable leaked from
# another cell.
df_in = input_textInput(xml_tool1)

print("\n############### Input dataframe (TextInput):\n\n", df_in)

# Tool 2: Select
col_spec = getConf_Select(results[4][1])

# Generate polars code from the Select tool configuration
code = tool_select(col_spec)

print("\n############### Generated code from Select tool: \n\n", code)

df = df_in

# SECURITY NOTE: `code` is generated from the workflow XML and is executed
# as Python. Only run this on workflows you trust.
# Execute in an explicit namespace so the inputs (`pl`, `df`) and the output
# (`df_output`) of the generated code are visible here.
exec_namespace = {"pl": pl, "df": df}
exec(code, exec_namespace)
df_output = exec_namespace["df_output"]

print(df_output)