{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Setup"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"# Open Alteryx XML into a string \n",
"import polars as pl \n",
"import xml.etree.ElementTree as ET\n",
"\n",
"xml_file_path = \"./SimpleWorkflow/SimpleWorkflow.yxmd\"\n",
"tree = ET.parse(xml_file_path)\n",
"root = tree.getroot()\n",
"\n",
"xml_string = ET.tostring(root, encoding='unicode')\n"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"None\n",
"None\n",
"None\n",
"None\n",
"None\n",
"None\n",
"None\n",
"None\n",
"None\n",
"None\n",
"None\n",
"None\n",
"None\n",
"('1', '\\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n 1\\n 4\\n \\n \\n 2\\n 5\\n \\n \\n 3\\n 6\\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n ')\n",
"('2', '\\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n ')\n",
"('3', '\\n \\n \\n \\n \\n \\n C:\\\\Users\\\\casey.morter\\\\AppData\\\\Local\\\\Temp\\\\Engine_24220_fcf44807b57fab4cb64cc2e4b0cb337e_\\\\Engine_24220_f76621528201864d89236be538ce397e~.yxdb\\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n ')\n",
"('4', '\\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n 5\\n 8\\n z\\n a\\n \\n \\n 6\\n 9\\n b\\n d\\n \\n \\n 7\\n 10\\n c\\n c\\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n ')\n",
"('5', '\\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n ')\n",
"('6', '\\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n MoreCount = [Count] + 10\\n\\n \\n \\n \\n \\n \\n ')\n",
"('7', '\\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n ')\n",
"('10', '\\n \\n \\n \\n \\n \\n C:\\\\Users\\\\casey.morter\\\\AppData\\\\Local\\\\Temp\\\\Engine_24220_fcf44807b57fab4cb64cc2e4b0cb337e_\\\\Engine_24220_f4c1fee3627dfb489419190371153cab~.yxdb\\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n ')\n",
"('11', '\\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n Path = [Engine.TempFilePath]\\n\\n \\n \\n \\n \\n \\n ')\n",
"('12', '\\n \\n \\n \\n \\n \\n First\\n 1\\n \\n \\n \\n \\n First 1\\n \\n \\n \\n \\n \\n ')\n",
"('13', '\\n \\n \\n \\n \\n \\n C:\\\\Users\\\\casey.morter\\\\AppData\\\\Local\\\\Temp\\\\Engine_24220_fcf44807b57fab4cb64cc2e4b0cb337e_\\\\Engine_24220_786c4b26fed826499b55ead59d943d58~.yxdb\\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n ')\n",
"('14', '\\n \\n \\n \\n \\n \\n .\\\\SimpleWorkflowOut.csv\\n \\n \\n CRLF\\n ,\\n False\\n True\\n 28591\\n True\\n \\n \\n \\n \\n \\n SimpleWorkflowOut.csv\\n \\n \\n \\n \\n \\n \\n \\n \\n ')\n",
"('15', '\\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n a\\n a\\n 4\\n \\n \\n b\\n d\\n 6\\n \\n \\n c\\n z\\n 7\\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n ')\n"
]
}
],
"source": [
"# Parse out nodes (tool data) into a dict\n",
"def extract_tool_id_and_contents(xml_string):\n",
" root = ET.fromstring(xml_string)\n",
" results = []\n",
" for node in root.iter('Node'):\n",
" tool_type = node.attrib.get('.//GuiSettings')\n",
" print(tool_type)\n",
" tool_id = node.attrib.get('ToolID')\n",
" content = ET.tostring(node, encoding='unicode')\n",
" results.append((tool_id, content))\n",
" return results\n",
"\n",
"\n",
"results = extract_tool_id_and_contents(xml_string)\n",
"\n",
"\n",
"for tool in results:\n",
" print(tool)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Tool Functions"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"def tool_select(col_spec: dict):\n",
" \"\"\" Generates select tool code\"\"\"\n",
" dynamic_code = \"df_output = df.with_columns(\\n\"\n",
" dynamic_code_suffix = ''\n",
" for old_name, (new_name, type, selected) in col_spec.items():\n",
"\n",
" if old_name == '*Unknown':\n",
" break \n",
" \n",
" if new_name:\n",
" alias = f\".alias('{new_name}')\"\n",
" dynamic_code_suffix += f\"df_output = df_output.drop(f'{old_name}')\\n\"\n",
" else:\n",
" alias = ''\n",
"\n",
" if type is not None:\n",
" if 'Int' in type:\n",
" cast = f\".cast(pl.{pl.Int64})\"\n",
" elif 'String' in type:\n",
" cast = f\".cast(pl.{pl.String})\"\n",
" else:\n",
" cast = ''\n",
"\n",
" if selected != 'False':\n",
" dynamic_code += f\"df.select(pl.col(f'{old_name}'){cast}{alias}),\\n\"\n",
" else:\n",
" dynamic_code_suffix += f\"df_output = df_output.drop(f'{old_name}')\\n\"\n",
"\n",
" dynamic_code += \")\\n\" + dynamic_code_suffix\n",
" \n",
" return dynamic_code"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"ename": "IndentationError",
"evalue": "expected an indented block after function definition on line 1 (2015830356.py, line 5)",
"output_type": "error",
"traceback": [
"\u001b[1;36m Cell \u001b[1;32mIn[6], line 5\u001b[1;36m\u001b[0m\n\u001b[1;33m xml_join_tool = results[6][1]\u001b[0m\n\u001b[1;37m ^\u001b[0m\n\u001b[1;31mIndentationError\u001b[0m\u001b[1;31m:\u001b[0m expected an indented block after function definition on line 1\n"
]
}
],
"source": [
"def tool_join(join_spec: dict, df_L: pl.DataFrame, df_R: pl.DataFrame):\n",
" \n",
"\n",
"\n",
"xml_join_tool = results[6][1]\n",
"join_spec = getConf_Join(xml_join_tool)\n",
"\n",
"print(join_spec)\n",
"\n",
"print(tool_join(join_spec))\n"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"L in: shape: (3, 3)\n",
"┌─────┬─────┬─────┐\n",
"│ foo ┆ bar ┆ ham │\n",
"│ --- ┆ --- ┆ --- │\n",
"│ i64 ┆ f64 ┆ str │\n",
"╞═════╪═════╪═════╡\n",
"│ 1 ┆ 6.0 ┆ a │\n",
"│ 2 ┆ 7.0 ┆ b │\n",
"│ 3 ┆ 8.0 ┆ c │\n",
"└─────┴─────┴─────┘\n",
"R in: shape: (3, 2)\n",
"┌───────┬─────┐\n",
"│ apple ┆ ham │\n",
"│ --- ┆ --- │\n",
"│ str ┆ str │\n",
"╞═══════╪═════╡\n",
"│ x ┆ a │\n",
"│ y ┆ b │\n",
"│ z ┆ d │\n",
"└───────┴─────┘\n",
"L: shape: (1, 3)\n",
"┌─────┬─────┬─────┐\n",
"│ foo ┆ bar ┆ ham │\n",
"│ --- ┆ --- ┆ --- │\n",
"│ i64 ┆ f64 ┆ str │\n",
"╞═════╪═════╪═════╡\n",
"│ 3 ┆ 8.0 ┆ c │\n",
"└─────┴─────┴─────┘\n"
]
}
],
"source": [
"df = pl.DataFrame(\n",
" {\n",
" \"foo\": [1, 2, 3],\n",
" \"bar\": [6.0, 7.0, 8.0],\n",
" \"ham\": [\"a\", \"b\", \"c\"],\n",
" }\n",
")\n",
"\n",
"other_df = pl.DataFrame(\n",
" {\n",
" \"apple\": [\"x\", \"y\", \"z\"],\n",
" \"ham\": [\"a\", \"b\", \"d\"],\n",
" }\n",
")\n",
"\n",
"print(\"L in:\", df)\n",
"\n",
"print(\"R in:\", other_df)\n",
"\n",
"print(\"L:\", df.join(other_df, left_on=\"ham\", right_on=\"ham\", how='anti'))\n",
"\n",
"# print(\"J: \", df.join(other_df, on=\"ham\", how='inner'))\n",
"\n",
"# print(\"R:\", other_df.join(df, on=\"ham\", how='anti'))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Parsing Tool Config data"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"def input_textInput(xml_string):\n",
" # Get XML for a Text input tool\n",
" root = ET.fromstring(xml_string)\n",
" # Extract the field names\n",
" fields = [field.attrib['name'] for field in root.findall(\".//Fields/Field\")]\n",
" # Extract the data rows\n",
" data_rows = [[c.text for c in row.findall(\"c\")] for row in root.findall(\".//Data/r\")]\n",
" # Create the polars dataframe\n",
" df = pl.DataFrame(data_rows, fields, orient=\"row\")\n",
" # Display the dataframe\n",
" return df"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"def getConf_Select(xml_string):\n",
" root = ET.fromstring(xml_string)\n",
" dict_SelectTool = {}\n",
"\n",
" for field in root.findall(\".//SelectFields/SelectField\"):\n",
" field_name = field.attrib['field']\n",
" field_selected = field.attrib['selected']\n",
"\n",
" try:\n",
" field_type = field.attrib['type']\n",
" except:\n",
" field_type = None\n",
" try:\n",
" field_rename = field.attrib['rename']\n",
" except:\n",
" field_rename = None\n",
"\n",
" dict_SelectTool[field_name] = (field_rename, field_type, field_selected)\n",
"\n",
" return dict_SelectTool"
]
},
{
"cell_type": "code",
"execution_count": 32,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"{'joinParams': ['Ham', 'Cheese'], 'fields': {'Right_Ham': ['Right_Ham', 'Right_', None, None], 'Right_Cheese': ['Right_Cheese', 'Right_', None, None], 'Right_Column 3': ['Right_Column 3', 'Right_', 'V_String', '11'], 'Right_Column 4': ['Right_Column 4', 'Right_', 'Int64', '8']}}\n"
]
}
],
"source": [
"def getConf_Join(xml_string):\n",
" root = ET.fromstring(xml_string)\n",
" dict_JoinTool = {}\n",
"\n",
" # Join parameters\n",
" for joinField in root.findall(\".//Configuration/JoinInfo\"):\n",
" if joinField.attrib['connection'] == \"Left\":\n",
" left_on = [field.attrib['field'] for field in joinField]\n",
" if joinField.attrib['connection'] == \"Right\":\n",
" right_on = [field.attrib['field'] for field in joinField]\n",
"\n",
" if left_on == right_on:\n",
" dict_JoinTool['joinParams'] = (left_on)\n",
" else:\n",
" dict_JoinTool['joinParams'] = (left_on, right_on)\n",
"\n",
" \n",
" # Field Parameters\n",
" fieldConfig = {}\n",
" for fields in root.findall(\".//SelectField\"):\n",
" result = []\n",
" if fields.attrib['selected'] == 'True':\n",
" for field in ['rename', 'input', 'type', 'size']:\n",
" try:\n",
" result.append(fields.attrib[field])\n",
" except KeyError:\n",
" result.append(None)\n",
"\n",
" if fields.attrib['field'] != '*Unknown':\n",
" fieldConfig[fields.attrib['field']] = result\n",
"\n",
"\n",
" dict_JoinTool['fields'] = fieldConfig\n",
"\n",
" return dict_JoinTool\n",
"\n",
"print(getConf_Join(xml_join_tool))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Working with the XML file"
]
},
{
"cell_type": "code",
"execution_count": 89,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"1 TextInput\n",
"shape: (3, 2)\n",
"┌─────────┬──────────┐\n",
"│ Column1 ┆ Column 2 │\n",
"│ --- ┆ --- │\n",
"│ str ┆ str │\n",
"╞═════════╪══════════╡\n",
"│ 1 ┆ 4 │\n",
"│ 2 ┆ 5 │\n",
"│ 3 ┆ 6 │\n",
"└─────────┴──────────┘\n",
"2 AlteryxSelect\n",
"3 BrowseV2\n",
"4 TextInput\n",
"shape: (3, 4)\n",
"┌──────────┬──────────┬─────┬────────┐\n",
"│ Column 3 ┆ Column 4 ┆ Ham ┆ Cheese │\n",
"│ --- ┆ --- ┆ --- ┆ --- │\n",
"│ str ┆ str ┆ str ┆ str │\n",
"╞══════════╪══════════╪═════╪════════╡\n",
"│ 5 ┆ 8 ┆ z ┆ a │\n",
"│ 6 ┆ 9 ┆ b ┆ d │\n",
"│ 7 ┆ 10 ┆ c ┆ c │\n",
"└──────────┴──────────┴─────┴────────┘\n",
"5 AlteryxSelect\n",
"6 Formula\n",
"7 Join\n",
"10 BrowseV2\n",
"11 Formula\n",
"12 Sample\n",
"13 BrowseV2\n",
"14 DbFileOutput\n",
"15 TextInput\n",
"shape: (3, 3)\n",
"┌─────┬────────┬───────┐\n",
"│ Ham ┆ Cheese ┆ Count │\n",
"│ --- ┆ --- ┆ --- │\n",
"│ str ┆ str ┆ str │\n",
"╞═════╪════════╪═══════╡\n",
"│ a ┆ a ┆ 4 │\n",
"│ b ┆ d ┆ 6 │\n",
"│ c ┆ z ┆ 7 │\n",
"└─────┴────────┴───────┘\n"
]
}
],
"source": [
"# Parse all tools in tools dict\n",
"for tool in results:\n",
"\n",
" ToolID = tool[0]\n",
" ToolXML = tool[1]\n",
" ToolType = ET.fromstring(ToolXML).find(\".//GuiSettings\").attrib['Plugin'].split(\".\")[2]\n",
"\n",
" print(ToolID, ToolType)\n",
"\n",
" if ToolType == 'TextInput':\n",
" print(input_textInput(ToolXML))"
]
},
{
"cell_type": "code",
"execution_count": 135,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"############### Input dataframe (TextInput):\n",
" shape: (3, 4)\n",
"┌──────────┬──────────┬─────┬────────┐\n",
"│ Column 3 ┆ Column 4 ┆ Ham ┆ Cheese │\n",
"│ --- ┆ --- ┆ --- ┆ --- │\n",
"│ str ┆ str ┆ str ┆ str │\n",
"╞══════════╪══════════╪═════╪════════╡\n",
"│ 5 ┆ 8 ┆ z ┆ a │\n",
"│ 6 ┆ 9 ┆ b ┆ d │\n",
"│ 7 ┆ 10 ┆ c ┆ c │\n",
"└──────────┴──────────┴─────┴────────┘\n",
"\n",
"############### Generated code from Select tool: \n",
" df_output = df.with_columns(\n",
"df.select(pl.col(f'Column 3').alias('Col_3_renamed')),\n",
")\n",
"df_output = df_output.drop(f'Column 3')\n",
"df_output = df_output.drop(f'Column 4')\n",
"\n",
"\n",
"############### Output DF: \n",
"\n",
" shape: (3, 3)\n",
"┌─────┬────────┬───────────────┐\n",
"│ Ham ┆ Cheese ┆ Col_3_renamed │\n",
"│ --- ┆ --- ┆ --- │\n",
"│ str ┆ str ┆ str │\n",
"╞═════╪════════╪═══════════════╡\n",
"│ z ┆ a ┆ 5 │\n",
"│ b ┆ d ┆ 6 │\n",
"│ c ┆ c ┆ 7 │\n",
"└─────┴────────┴───────────────┘\n"
]
}
],
"source": [
"# Tool 1: TextInput\n",
"xml_tool1 = results[3][1]\n",
"df_in = input_textInput(xml_tool1)\n",
"print(\"\\n############### Input dataframe (TextInput):\\n\", df_in)\n",
"# Tool 2: Select\n",
"col_spec = getConf_Select(results[4][1])\n",
"# Generate code from \n",
"code = tool_select(col_spec)\n",
"print(\"\\n############### Generated code from Select tool: \\n\", code)\n",
"# Execute and display output\n",
"df = df_in\n",
"exec(code)\n",
"print(\"\\n############### Output DF: \\n\\n\", df_output)"
]
},
{
"cell_type": "code",
"execution_count": 92,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"('1', '\\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n 1\\n 4\\n \\n \\n 2\\n 5\\n \\n \\n 3\\n 6\\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n ')\n",
"('2', '\\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n ')\n",
"('3', '\\n \\n \\n \\n \\n \\n C:\\\\Users\\\\Casey\\\\AppData\\\\Local\\\\Temp\\\\Engine_4072_8b6c0740e308d445ab856d90eb0e4ee9_\\\\Engine_4072_93f5c2999bcb8c478c002b15166e2b45~.yxdb\\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n ')\n",
"('4', '\\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n 5\\n 8\\n z\\n a\\n \\n \\n 6\\n 9\\n b\\n d\\n \\n \\n 7\\n 10\\n c\\n c\\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n ')\n",
"('5', '\\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n ')\n",
"('6', '\\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n MoreCount = [Count] + 10\\n\\n \\n \\n \\n \\n \\n ')\n",
"('7', '\\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n ')\n",
"('10', '\\n \\n \\n \\n \\n \\n C:\\\\Users\\\\Casey\\\\AppData\\\\Local\\\\Temp\\\\Engine_4072_8b6c0740e308d445ab856d90eb0e4ee9_\\\\Engine_4072_ab9b90f8fc991440905d667f1d5c7325~.yxdb\\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n ')\n",
"('11', '\\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n Path = [Engine.TempFilePath]\\n\\n \\n \\n \\n \\n \\n ')\n",
"('12', '\\n \\n \\n \\n \\n \\n First\\n 1\\n \\n \\n \\n \\n First 1\\n \\n \\n \\n \\n \\n ')\n",
"('13', '\\n \\n \\n \\n \\n \\n C:\\\\Users\\\\Casey\\\\AppData\\\\Local\\\\Temp\\\\Engine_4072_8b6c0740e308d445ab856d90eb0e4ee9_\\\\Engine_4072_dd72ded80941104b9b9be56761379cb2~.yxdb\\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n ')\n",
"('14', '\\n \\n \\n \\n \\n \\n .\\\\SimpleWorkflowOut.csv\\n \\n False\\n \\n CRLF\\n ,\\n False\\n True\\n 28591\\n True\\n \\n \\n \\n \\n \\n SimpleWorkflowOut.csv\\n \\n \\n \\n \\n \\n ')\n",
"('15', '\\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n a\\n a\\n 4\\n \\n \\n b\\n d\\n 6\\n \\n \\n c\\n z\\n 7\\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n ')\n"
]
}
],
"source": [
"for tool in results:\n",
" print(tool)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "polaryx",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.4"
}
},
"nbformat": 4,
"nbformat_minor": 2
}