413 lines
13 KiB
Plaintext
413 lines
13 KiB
Plaintext
{
|
|
"cells": [
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"# Setup"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 293,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# Open Alteryx XML into a string \n",
|
|
"import polars as pl \n",
|
|
"import xml.etree.ElementTree as ET\n",
|
|
"\n",
|
|
"xml_file_path = \"./SimpleWorkflow/SimpleWorkflow.yxmd\"\n",
|
|
"tree = ET.parse(xml_file_path)\n",
|
|
"root = tree.getroot()\n",
|
|
"\n",
|
|
"xml_string = ET.tostring(root, encoding='unicode')\n"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 294,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# Parse out nodes (tool data) into a dict\n",
|
|
"def extract_tool_id_and_contents(xml_string):\n",
|
|
" root = ET.fromstring(xml_string)\n",
|
|
" results = []\n",
|
|
" for node in root.iter('Node'):\n",
|
|
" tool_id = node.attrib.get('ToolID')\n",
|
|
" content = ET.tostring(node, encoding='unicode')\n",
|
|
" results.append((tool_id, content))\n",
|
|
" return results\n",
|
|
"\n",
|
|
"\n",
|
|
"results = extract_tool_id_and_contents(xml_string)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"# Tool Functions"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 254,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"def tool_select(col_spec: dict):\n",
|
|
" \"\"\" Generates select tool code\"\"\"\n",
|
|
" dynamic_code = \"df_output = df.with_columns(\\n\"\n",
|
|
" dynamic_code_suffix = ''\n",
|
|
" for old_name, (new_name, type, selected) in col_spec.items():\n",
|
|
"\n",
|
|
" if old_name == '*Unknown':\n",
|
|
" break \n",
|
|
" \n",
|
|
" if new_name:\n",
|
|
" alias = f\".alias('{new_name}')\"\n",
|
|
" dynamic_code_suffix += f\"df_output = df_output.drop(f'{old_name}')\\n\"\n",
|
|
" else:\n",
|
|
" alias = ''\n",
|
|
"\n",
|
|
" if type is not None:\n",
|
|
" if 'Int' in type:\n",
|
|
" cast = f\".cast(pl.{pl.Int64})\"\n",
|
|
" elif 'String' in type:\n",
|
|
" cast = f\".cast(pl.{pl.String})\"\n",
|
|
" else:\n",
|
|
" cast = ''\n",
|
|
"\n",
|
|
" if selected != 'False':\n",
|
|
" dynamic_code += f\"df.select(pl.col(f'{old_name}'){cast}{alias}),\\n\"\n",
|
|
" else:\n",
|
|
" dynamic_code_suffix += f\"df_output = df_output.drop(f'{old_name}')\\n\"\n",
|
|
"\n",
|
|
" dynamic_code += \")\\n\" + dynamic_code_suffix\n",
|
|
" \n",
|
|
" return dynamic_code"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"# Parsing Tool Config data"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 295,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"def input_textInput(xml_string):\n",
|
|
" # Get XML for a Text input tool\n",
|
|
" root = ET.fromstring(xml_string)\n",
|
|
" # Extract the field names\n",
|
|
" fields = [field.attrib['name'] for field in root.findall(\".//Fields/Field\")]\n",
|
|
" # Extract the data rows\n",
|
|
" data_rows = [[int(c.text) for c in row.findall(\"c\")] for row in root.findall(\".//Data/r\")]\n",
|
|
" # Create the polars dataframe\n",
|
|
" df = pl.DataFrame(data_rows, fields, orient=\"row\")\n",
|
|
" # Display the dataframe\n",
|
|
" return df"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 296,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"def getConf_Select(xml_string):\n",
|
|
" root = ET.fromstring(xml_string)\n",
|
|
" dict_SelectTool = {}\n",
|
|
"\n",
|
|
" for field in root.findall(\".//SelectFields/SelectField\"):\n",
|
|
" field_name = field.attrib['field']\n",
|
|
" field_selected = field.attrib['selected']\n",
|
|
"\n",
|
|
" try:\n",
|
|
" field_type = field.attrib['type']\n",
|
|
" except:\n",
|
|
" field_type = None\n",
|
|
" try:\n",
|
|
" field_rename = field.attrib['rename']\n",
|
|
" except:\n",
|
|
" field_rename = None\n",
|
|
"\n",
|
|
" dict_SelectTool[field_name] = (field_rename, field_type, field_selected)\n",
|
|
"\n",
|
|
" return dict_SelectTool"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 313,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"<Node ToolID=\"7\">\n",
|
|
" <GuiSettings Plugin=\"AlteryxBasePluginsGui.Join.Join\">\n",
|
|
" <Position x=\"318\" y=\"258\" />\n",
|
|
" </GuiSettings>\n",
|
|
" <Properties>\n",
|
|
" <Configuration joinByRecordPos=\"False\">\n",
|
|
" <JoinInfo connection=\"Left\">\n",
|
|
" <Field field=\"Col_3_renamed\" />\n",
|
|
" </JoinInfo>\n",
|
|
" <JoinInfo connection=\"Right\">\n",
|
|
" <Field field=\"Column 3\" />\n",
|
|
" </JoinInfo>\n",
|
|
" <SelectConfiguration>\n",
|
|
" <Configuration outputConnection=\"Join\">\n",
|
|
" <OrderChanged value=\"False\" />\n",
|
|
" <CommaDecimal value=\"False\" />\n",
|
|
" <SelectFields>\n",
|
|
" <SelectField field=\"Right_Column 3\" selected=\"True\" rename=\"Right_Column 3\" input=\"Right_\" />\n",
|
|
" <SelectField field=\"Right_Column 4\" selected=\"True\" rename=\"Right_Column 4\" input=\"Right_\" />\n",
|
|
" <SelectField field=\"*Unknown\" selected=\"True\" />\n",
|
|
" </SelectFields>\n",
|
|
" </Configuration>\n",
|
|
" </SelectConfiguration>\n",
|
|
" </Configuration>\n",
|
|
" <Annotation DisplayMode=\"0\">\n",
|
|
" <Name />\n",
|
|
" <DefaultAnnotationText />\n",
|
|
" <Left value=\"False\" />\n",
|
|
" </Annotation>\n",
|
|
" </Properties>\n",
|
|
" <EngineSettings EngineDll=\"AlteryxBasePluginsEngine.dll\" EngineDllEntryPoint=\"AlteryxJoin\" />\n",
|
|
" </Node>\n",
|
|
" \n",
|
|
"joinByRecordPos: False\n",
|
|
"{'joinParams': ('Col_3_renamed', 'Column 3')}\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"### WIP\n",
|
|
"\n",
|
|
"def getConf_Join(xml_string):\n",
|
|
" print(xml_string)\n",
|
|
" root = ET.fromstring(xml_string)\n",
|
|
" dict_JoinTool = {}\n",
|
|
"\n",
|
|
" print(\"joinByRecordPos:\", root.find(\".//Properties/Configuration\").attrib['joinByRecordPos'])\n",
|
|
"\n",
|
|
" # Join parameters\n",
|
|
" for joinField in root.findall(\".//Configuration/JoinInfo\"):\n",
|
|
" if joinField.attrib['connection'] == \"Left\":\n",
|
|
" left_on = joinField.find('Field').attrib['field']\n",
|
|
" elif joinField.attrib['connection'] == \"Right\":\n",
|
|
" right_on = joinField.find('Field').attrib['field']\n",
|
|
"\n",
|
|
" if left_on == right_on:\n",
|
|
" dict_JoinTool['joinParams'] = ('on', left_on)\n",
|
|
" else:\n",
|
|
" dict_JoinTool['joinParams'] = (left_on, right_on)\n",
|
|
"\n",
|
|
" # Select parameters\n",
|
|
" for joinField in root.findall(\".//Configuration/SelectConfiguration\"):\n",
|
|
"\n",
|
|
" return dict_JoinTool\n",
|
|
"\n",
|
|
"xml_join_tool = results[6][1]\n",
|
|
"print(getConf_Join(xml_join_tool))\n"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"# Working with the XML file"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 40,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"1 TextInput\n",
|
|
"shape: (3, 2)\n",
|
|
"┌──────────┬──────────┐\n",
|
|
"│ Column 3 ┆ Column 4 │\n",
|
|
"│ --- ┆ --- │\n",
|
|
"│ i64 ┆ i64 │\n",
|
|
"╞══════════╪══════════╡\n",
|
|
"│ 5 ┆ 8 │\n",
|
|
"│ 6 ┆ 9 │\n",
|
|
"│ 7 ┆ 10 │\n",
|
|
"└──────────┴──────────┘\n",
|
|
"2 AlteryxSelect\n",
|
|
"3 BrowseV2\n",
|
|
"4 TextInput\n",
|
|
"shape: (3, 2)\n",
|
|
"┌──────────┬──────────┐\n",
|
|
"│ Column 3 ┆ Column 4 │\n",
|
|
"│ --- ┆ --- │\n",
|
|
"│ i64 ┆ i64 │\n",
|
|
"╞══════════╪══════════╡\n",
|
|
"│ 5 ┆ 8 │\n",
|
|
"│ 6 ┆ 9 │\n",
|
|
"│ 7 ┆ 10 │\n",
|
|
"└──────────┴──────────┘\n",
|
|
"5 AlteryxSelect\n",
|
|
"6 Formula\n",
|
|
"7 Join\n",
|
|
"10 BrowseV2\n",
|
|
"11 Formula\n",
|
|
"12 Sample\n",
|
|
"13 BrowseV2\n",
|
|
"14 DbFileOutput\n"
|
|
]
|
|
},
|
|
{
|
|
"name": "stderr",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"/tmp/ipykernel_579015/219306832.py:9: DataOrientationWarning: Row orientation inferred during DataFrame construction. Explicitly specify the orientation by passing `orient=\"row\"` to silence this warning.\n",
|
|
" df = pl.DataFrame(data_rows, fields)\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"# Parse all tools in tools dict\n",
|
|
"for tool in results:\n",
|
|
"\n",
|
|
" ToolID = tool[0]\n",
|
|
" ToolXML = tool[1]\n",
|
|
" ToolType = ET.fromstring(ToolXML).find(\".//GuiSettings\").attrib['Plugin'].split(\".\")[2]\n",
|
|
"\n",
|
|
" print(ToolID, ToolType)\n",
|
|
"\n",
|
|
" if ToolType == 'TextInput':\n",
|
|
" print(input_textInput(ToolXML))"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 292,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"\n",
|
|
"############### Input dataframe (TextInput):\n",
|
|
"\n",
|
|
" shape: (3, 2)\n",
|
|
"┌──────────┬──────────┐\n",
|
|
"│ Column 3 ┆ Column 4 │\n",
|
|
"│ --- ┆ --- │\n",
|
|
"│ i64 ┆ i64 │\n",
|
|
"╞══════════╪══════════╡\n",
|
|
"│ 5 ┆ 8 │\n",
|
|
"│ 6 ┆ 9 │\n",
|
|
"│ 7 ┆ 10 │\n",
|
|
"└──────────┴──────────┘\n",
|
|
"\n",
|
|
"############### Generated code from Select tool: \n",
|
|
"\n",
|
|
" df_output = df.with_columns(\n",
|
|
"df.select(pl.col(f'Column 3').alias('Col_3_renamed')),\n",
|
|
")\n",
|
|
"df_output = df_output.drop(f'Column 3')\n",
|
|
"df_output = df_output.drop(f'Column 4')\n",
|
|
"\n",
|
|
"shape: (3, 1)\n",
|
|
"┌───────────────┐\n",
|
|
"│ Col_3_renamed │\n",
|
|
"│ --- │\n",
|
|
"│ i64 │\n",
|
|
"╞═══════════════╡\n",
|
|
"│ 5 │\n",
|
|
"│ 6 │\n",
|
|
"│ 7 │\n",
|
|
"└───────────────┘\n"
|
|
]
|
|
},
|
|
{
|
|
"name": "stderr",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"/tmp/ipykernel_579015/3571569777.py:9: DataOrientationWarning: Row orientation inferred during DataFrame construction. Explicitly specify the orientation by passing `orient=\"row\"` to silence this warning.\n",
|
|
" df = pl.DataFrame(data_rows, fields)\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"# Tool 1: TextInput\n",
|
|
"xml_tool1 = results[3][1]\n",
|
|
"df_in = TextInputToDf(ToolXML)\n",
|
|
"\n",
|
|
"print(\"\\n############### Input dataframe (TextInput):\\n\\n\", df_in)\n",
|
|
"\n",
|
|
"# Tool 2: Select\n",
|
|
"col_spec = getConf_Select(results[4][1])\n",
|
|
"\n",
|
|
"# print(\"\\n############### Select tool spec taken from XML:\\n\\n\", col_spec)\n",
|
|
"\n",
|
|
"\n",
|
|
"# Generate code from \n",
|
|
"code = tool_select(col_spec)\n",
|
|
"\n",
|
|
"print(\"\\n############### Generated code from Select tool: \\n\\n\", code)\n",
|
|
"\n",
|
|
"df = df_in\n",
|
|
"\n",
|
|
"exec(code)\n",
|
|
"\n",
|
|
"print(df_output)\n",
|
|
"\n"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 174,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": []
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": []
|
|
}
|
|
],
|
|
"metadata": {
|
|
"kernelspec": {
|
|
"display_name": "polaryx",
|
|
"language": "python",
|
|
"name": "python3"
|
|
},
|
|
"language_info": {
|
|
"codemirror_mode": {
|
|
"name": "ipython",
|
|
"version": 3
|
|
},
|
|
"file_extension": ".py",
|
|
"mimetype": "text/x-python",
|
|
"name": "python",
|
|
"nbconvert_exporter": "python",
|
|
"pygments_lexer": "ipython3",
|
|
"version": "3.12.4"
|
|
}
|
|
},
|
|
"nbformat": 4,
|
|
"nbformat_minor": 2
|
|
}
|