{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Setup" ] }, { "cell_type": "code", "execution_count": 23, "metadata": {}, "outputs": [], "source": [ "# Open Alteryx XML into a string \n", "\n", "import polars as pl \n", "import xml.etree.ElementTree as ET\n", "\n", "xml_file_path = \"./SimpleWorkflow/SimpleWorkflow.yxmd\"\n", "tree = ET.parse(xml_file_path)\n", "root = tree.getroot()\n", "\n", "xml_string = ET.tostring(root, encoding='unicode')\n" ] }, { "cell_type": "code", "execution_count": 24, "metadata": {}, "outputs": [], "source": [ "# Parse out nodes (tool data) into a dict\n", "\n", "def extract_tool_id_and_contents(xml_string):\n", " root = ET.fromstring(xml_string)\n", " results = []\n", " for node in root.iter('Node'):\n", " tool_id = node.attrib.get('ToolID')\n", " content = ET.tostring(node, encoding='unicode')\n", " results.append((tool_id, content))\n", " return results\n", "\n", "\n", "results = extract_tool_id_and_contents(xml_string)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Tool Functions" ] }, { "cell_type": "code", "execution_count": 25, "metadata": {}, "outputs": [], "source": [ "def selectTool(df: pl.DataFrame, col_specs: dict):\n", " \"\"\"\n", " Reshape a Polars DataFrame by renaming and retyping columns according to the provided dictionary.\n", "\n", " Args:\n", " df (pl.DataFrame): The input Polars DataFrame.\n", " col_specs (dict): A dictionary where keys are column names in the original DataFrame,\n", " and values are tuples containing the new column name and data type.\n", "\n", " Returns:\n", " pl.DataFrame: The reshaped Polars DataFrame with renamed and retyped columns.\n", " \"\"\"\n", " for old_name, (new_name, dt) in col_specs.items():\n", " df = df.rename({old_name: new_name})\n", " if dt is not None:\n", " df = df.with_column(pl.col(old_name).cast(dt))\n", " return df" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Parsing Tool Config data" ] }, { "cell_type": "code", "execution_count": 26, "metadata": {}, "outputs": [], "source": [ "def TextInputToDf(xml_string):\n", " # Get XML for a Text input tool\n", " root = ET.fromstring(results[3][1])\n", " # Extract the field names\n", " fields = [field.attrib['name'] for field in root.findall(\".//Fields/Field\")]\n", " # Extract the data rows\n", " data_rows = [[int(c.text) for c in row.findall(\"c\")] for row in root.findall(\".//Data/r\")]\n", " # Create the polars dataframe\n", " df = pl.DataFrame(data_rows, fields)\n", " # Display the dataframe\n", " return df" ] }, { "cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "1 TextInput\n", "shape: (3, 2)\n", "┌──────────┬──────────┐\n", "│ Column 3 ┆ Column 4 │\n", "│ --- ┆ --- │\n", "│ i64 ┆ i64 │\n", "╞══════════╪══════════╡\n", "│ 5 ┆ 8 │\n", "│ 6 ┆ 9 │\n", "│ 7 ┆ 10 │\n", "└──────────┴──────────┘\n", "2 AlteryxSelect\n", "3 BrowseV2\n", "4 TextInput\n", "shape: (3, 2)\n", "┌──────────┬──────────┐\n", "│ Column 3 ┆ Column 4 │\n", "│ --- ┆ --- │\n", "│ i64 ┆ i64 │\n", "╞══════════╪══════════╡\n", "│ 5 ┆ 8 │\n", "│ 6 ┆ 9 │\n", "│ 7 ┆ 10 │\n", "└──────────┴──────────┘\n", "5 AlteryxSelect\n", "6 Formula\n", "7 Join\n", "10 BrowseV2\n", "11 Formula\n", "12 Sample\n", "13 BrowseV2\n", "14 DbFileOutput\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "C:\\Users\\casey.morter\\AppData\\Local\\Temp\\ipykernel_4012\\3571569777.py:9: DataOrientationWarning: Row orientation inferred during DataFrame construction. Explicitly specify the orientation by passing `orient=\"row\"` to silence this warning.\n", " df = pl.DataFrame(data_rows, fields)\n" ] } ], "source": [ "# Parse all tools in tools dict\n", "for tool in results:\n", "\n", " ToolID = tool[0]\n", " ToolXML = tool[1]\n", " ToolType = ET.fromstring(ToolXML).find(\".//GuiSettings\").attrib['Plugin'].split(\".\")[2]\n", "\n", " print(ToolID, ToolType)\n", "\n", " if ToolType == 'TextInput':\n", " print(TextInputToDf(ToolXML))" ] }, { "cell_type": "code", "execution_count": 29, "metadata": {}, "outputs": [], "source": [ "# Check out a tool\n", "# 0 = TextInput\n", "# 4 = select with rename\n", "tool_xml = results[4][1]" ] }, { "cell_type": "code", "execution_count": 27, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'c:\\\\Users\\\\casey.morter\\\\OneDrive - JLL\\\\Documents\\\\01 Workspace\\\\01 Python\\\\Polaryx'" ] }, "execution_count": 27, "metadata": {}, "output_type": "execute_result" } ], "source": [ "%pwd" ] }, { "cell_type": "code", "execution_count": 31, "metadata": {}, "outputs": [ { "ename": "KeyError", "evalue": "'type'", "output_type": "error", "traceback": [ "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[1;31mKeyError\u001b[0m Traceback (most recent call last)", "Cell \u001b[1;32mIn[31], line 16\u001b[0m\n\u001b[0;32m 14\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m field \u001b[38;5;129;01min\u001b[39;00m root\u001b[38;5;241m.\u001b[39mfindall(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m.//SelectFields/SelectField\u001b[39m\u001b[38;5;124m\"\u001b[39m):\n\u001b[0;32m 15\u001b[0m field_name \u001b[38;5;241m=\u001b[39m field\u001b[38;5;241m.\u001b[39mattrib[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mfield\u001b[39m\u001b[38;5;124m'\u001b[39m]\n\u001b[1;32m---> 16\u001b[0m field_type \u001b[38;5;241m=\u001b[39m \u001b[43mfield\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mattrib\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mtype\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m]\u001b[49m\n\u001b[0;32m 18\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m 19\u001b[0m field_rename \u001b[38;5;241m=\u001b[39m field\u001b[38;5;241m.\u001b[39mattrib[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mrename\u001b[39m\u001b[38;5;124m'\u001b[39m]\n", "\u001b[1;31mKeyError\u001b[0m: 'type'" ] } ], "source": [ "col_specs = {\n", " \"A\": (\"x\", int, False),\n", " \"B\": (\"y\", str, False),\n", " \"D\": (None, None, True) # drop this column\n", "}\n", "\n", "\n", "\n", "# Parse the XML\n", "root = ET.fromstring(tool_xml)\n", "\n", "dict_SelectTool = {}\n", "\n", "for field in root.findall(\".//SelectFields/SelectField\"):\n", " field_name = field.attrib['field']\n", "\n", " try:\n", " field_type = field.attrib['type']\n", " except:\n", " field_type = None\n", " \n", " try:\n", " field_rename = field.attrib['rename']\n", " except:\n", " field_rename = None\n", "\n", "\n", "\n", "\n", " dict_SelectTool[field_name] = (None, None, field_rename)\n", " \n", "\n", " # dict_SelectTool['field']\n", "\n", " # if field.attrib['field'] != '*Unknown':\n", " # field.attrib['selected'], \"Type: \", field.attrib['size']\n", "\n", "\n", "\n", " \n", " # print(field.attrib['field'], \"| Selected:\", field.attrib['selected'], \"Type: \", field.attrib['size'])\n", "\n", "dict_SelectTool" ] }, { "cell_type": "code", "execution_count": 108, "metadata": {}, "outputs": [], "source": [ "gui_settings_text = root.find(\".//GuiSettings\")" ] }, { "cell_type": "code", "execution_count": 114, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'AlteryxBasePluginsGui.TextInput.TextInput'" ] }, "execution_count": 114, "metadata": {}, "output_type": "execute_result" } ], "source": [ "gui_settings_text.attrib['Plugin']" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import polars as pl\n", "\n", "def reshape_polars_df(df: pl.DataFrame, col_specs: dict):\n", " \"\"\"\n", " Reshape a Polars DataFrame by renaming and retyping columns according to the provided dictionary.\n", "\n", " Args:\n", " df (pl.DataFrame): The input Polars DataFrame.\n", " col_specs (dict): A dictionary where keys are column names in the original DataFrame,\n", " and values are tuples containing the new column name, data type, and a boolean indicating whether\n", " the column should be dropped or not.\n", "\n", " Returns:\n", " pl.DataFrame: The reshaped Polars DataFrame with renamed and retyped columns.\n", " \"\"\"\n", " for old_name, (new_name, dt, drop) in col_specs.items():\n", " if drop:\n", " df = df.drop(old_name)\n", " else:\n", " df = df.rename({old_name: new_name})\n", " if dt is not None:\n", " df = df.with_column(pl.col(old_name).cast(dt))\n", " return df\n" ] } ], "metadata": { "kernelspec": { "display_name": "polaryx", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.12.4" } }, "nbformat": 4, "nbformat_minor": 2 }