Polaryx/scratchpad.ipynb

{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Setup"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Open Alteryx XML into a string \n",
    "\n",
    "import polars as pl \n",
    "import xml.etree.ElementTree as ET\n",
    "\n",
    "# Path to the Alteryx workflow file, which is read below as XML\n",
    "xml_file_path = \"./SimpleWorkflow/SimpleWorkflow.yxmd\"\n",
    "tree = ET.parse(xml_file_path)\n",
    "root = tree.getroot()\n",
    "\n",
    "# Serialize the whole document back to text; encoding='unicode' yields a str instead of bytes\n",
    "xml_string = ET.tostring(root, encoding='unicode')\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Parse out nodes (tool data) into a list of (ToolID, XML) tuples\n",
    "\n",
    "def extract_tool_id_and_contents(xml_string):\n",
    "    \"\"\"Collect every <Node> element of a workflow as (ToolID, XML text) pairs.\n",
    "\n",
    "    Args:\n",
    "        xml_string: Workflow XML as a string.\n",
    "\n",
    "    Returns:\n",
    "        list[tuple]: One ``(tool_id, node_xml)`` tuple per ``Node`` element, in document\n",
    "        order; ``tool_id`` is None when the node has no ``ToolID`` attribute.\n",
    "    \"\"\"\n",
    "    document = ET.fromstring(xml_string)\n",
    "    return [\n",
    "        (element.attrib.get('ToolID'), ET.tostring(element, encoding='unicode'))\n",
    "        for element in document.iter('Node')\n",
    "    ]\n",
    "\n",
    "\n",
    "results = extract_tool_id_and_contents(xml_string)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Tool Functions"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [],
   "source": [
    "def selectTool(df: pl.DataFrame, col_specs: dict):\n",
    "    \"\"\"\n",
    "    Rename and retype columns of a Polars DataFrame according to the provided dictionary.\n",
    "\n",
    "    Args:\n",
    "        df (pl.DataFrame): The input Polars DataFrame.\n",
    "        col_specs (dict): A dictionary where keys are column names in the original DataFrame,\n",
    "            and values are ``(new_name, dtype)`` tuples. ``new_name`` may be None to keep\n",
    "            the current name; ``dtype`` may be None to keep the current type.\n",
    "\n",
    "    Returns:\n",
    "        pl.DataFrame: The DataFrame with the requested casts and renames applied.\n",
    "    \"\"\"\n",
    "    casts = []\n",
    "    renames = {}\n",
    "    for old_name, (new_name, dt) in col_specs.items():\n",
    "        if dt is not None:\n",
    "            casts.append(pl.col(old_name).cast(dt))\n",
    "        if new_name is not None and new_name != old_name:\n",
    "            renames[old_name] = new_name\n",
    "\n",
    "    # Cast while the columns still carry their original names: casting after the\n",
    "    # rename would reference a column that no longer exists.\n",
    "    if casts:\n",
    "        df = df.with_columns(casts)\n",
    "    # Apply all renames in one call once the casts are done.\n",
    "    if renames:\n",
    "        df = df.rename(renames)\n",
    "    return df"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Parsing Tool Config data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [],
   "source": [
    "def TextInputToDf(xml_string):\n",
    "    \"\"\"Build a Polars DataFrame from the XML of a Text Input tool node.\n",
    "\n",
    "    Args:\n",
    "        xml_string (str): XML of a single tool node containing ``Fields/Field``\n",
    "            and ``Data/r/c`` elements.\n",
    "\n",
    "    Returns:\n",
    "        pl.DataFrame: One column per ``Field`` (named by its ``name`` attribute) and one\n",
    "        row per ``r`` element. Cell text is parsed as int; an empty cell becomes None.\n",
    "\n",
    "    Raises:\n",
    "        ValueError: If a non-empty cell does not contain an integer.\n",
    "    \"\"\"\n",
    "    # Parse the XML that was passed in rather than a fixed entry of the global `results`\n",
    "    root = ET.fromstring(xml_string)\n",
    "    # Extract the field names\n",
    "    fields = [field.attrib['name'] for field in root.findall(\".//Fields/Field\")]\n",
    "    # Extract the data rows; an element with no text (<c/>) has text None\n",
    "    data_rows = [\n",
    "        [None if c.text is None else int(c.text) for c in row.findall(\"c\")]\n",
    "        for row in root.findall(\".//Data/r\")\n",
    "    ]\n",
    "    # Each inner list is one row, so state the orientation explicitly instead of\n",
    "    # relying on inference (which emits a DataOrientationWarning)\n",
    "    df = pl.DataFrame(data_rows, fields, orient=\"row\")\n",
    "    return df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "1 TextInput\n",
      "shape: (3, 2)\n",
      "┌──────────┬──────────┐\n",
      "│ Column 3 ┆ Column 4 │\n",
      "│ ---      ┆ ---      │\n",
      "│ i64      ┆ i64      │\n",
      "╞══════════╪══════════╡\n",
      "│ 5        ┆ 8        │\n",
      "│ 6        ┆ 9        │\n",
      "│ 7        ┆ 10       │\n",
      "└──────────┴──────────┘\n",
      "2 AlteryxSelect\n",
      "3 BrowseV2\n",
      "4 TextInput\n",
      "shape: (3, 2)\n",
      "┌──────────┬──────────┐\n",
      "│ Column 3 ┆ Column 4 │\n",
      "│ ---      ┆ ---      │\n",
      "│ i64      ┆ i64      │\n",
      "╞══════════╪══════════╡\n",
      "│ 5        ┆ 8        │\n",
      "│ 6        ┆ 9        │\n",
      "│ 7        ┆ 10       │\n",
      "└──────────┴──────────┘\n",
      "5 AlteryxSelect\n",
      "6 Formula\n",
      "7 Join\n",
      "10 BrowseV2\n",
      "11 Formula\n",
      "12 Sample\n",
      "13 BrowseV2\n",
      "14 DbFileOutput\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "C:\\Users\\casey.morter\\AppData\\Local\\Temp\\ipykernel_4012\\3571569777.py:9: DataOrientationWarning: Row orientation inferred during DataFrame construction. Explicitly specify the orientation by passing `orient=\"row\"` to silence this warning.\n",
      "  df = pl.DataFrame(data_rows, fields)\n"
     ]
    }
   ],
   "source": [
    "# Report the ID and type of every tool in `results`; preview Text Input data\n",
    "for tool in results:\n",
    "    ToolID, ToolXML = tool\n",
    "\n",
    "    # The Plugin attribute is dotted; its third segment is used as the tool type\n",
    "    gui_settings = ET.fromstring(ToolXML).find(\".//GuiSettings\")\n",
    "    ToolType = gui_settings.attrib['Plugin'].split(\".\")[2]\n",
    "    print(ToolID, ToolType)\n",
    "\n",
    "    if ToolType == 'TextInput':\n",
    "        print(TextInputToDf(ToolXML))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Pick one tool's XML to inspect, by position in `results` (not by ToolID)\n",
    "# 0 = TextInput\n",
    "# 4 = select with rename\n",
    "tool_xml = results[4][1]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'c:\\\\Users\\\\casey.morter\\\\OneDrive - JLL\\\\Documents\\\\01 Workspace\\\\01 Python\\\\Polaryx'"
      ]
     },
     "execution_count": 27,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "%pwd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [
    {
     "ename": "KeyError",
     "evalue": "'type'",
     "output_type": "error",
     "traceback": [
      "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[1;31mKeyError\u001b[0m                                  Traceback (most recent call last)",
      "Cell \u001b[1;32mIn[31], line 16\u001b[0m\n\u001b[0;32m     14\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m field \u001b[38;5;129;01min\u001b[39;00m root\u001b[38;5;241m.\u001b[39mfindall(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m.//SelectFields/SelectField\u001b[39m\u001b[38;5;124m\"\u001b[39m):\n\u001b[0;32m     15\u001b[0m     field_name \u001b[38;5;241m=\u001b[39m field\u001b[38;5;241m.\u001b[39mattrib[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mfield\u001b[39m\u001b[38;5;124m'\u001b[39m]\n\u001b[1;32m---> 16\u001b[0m     field_type \u001b[38;5;241m=\u001b[39m \u001b[43mfield\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mattrib\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mtype\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m]\u001b[49m\n\u001b[0;32m     18\u001b[0m     \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m     19\u001b[0m         field_rename \u001b[38;5;241m=\u001b[39m  field\u001b[38;5;241m.\u001b[39mattrib[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mrename\u001b[39m\u001b[38;5;124m'\u001b[39m]\n",
      "\u001b[1;31mKeyError\u001b[0m: 'type'"
     ]
    }
   ],
   "source": [
    "# Target shape for a Select tool spec: {old_name: (new_name, dtype, drop)}\n",
    "col_specs = {\n",
    "    \"A\": (\"x\", int, False),\n",
    "    \"B\": (\"y\", str, False),\n",
    "    \"D\": (None, None, True)  # drop this column\n",
    "}\n",
    "\n",
    "# Parse the XML\n",
    "root = ET.fromstring(tool_xml)\n",
    "\n",
    "dict_SelectTool = {}\n",
    "\n",
    "for field in root.findall(\".//SelectFields/SelectField\"):\n",
    "    field_name = field.attrib['field']\n",
    "\n",
    "    # Optional attributes: .get() returns None when absent, without a bare `except`\n",
    "    # that would also hide unrelated errors\n",
    "    field_type = field.attrib.get('type')\n",
    "    field_rename = field.attrib.get('rename')\n",
    "\n",
    "    # Drop only when the field is explicitly deselected\n",
    "    # NOTE(review): assumes the attribute is written as selected=\"True\"/\"False\" - confirm against the workflow XML\n",
    "    field_drop = field.attrib.get('selected', 'True').strip().lower() == 'false'\n",
    "\n",
    "    # Same layout as col_specs: (new_name, dtype, drop). The rename goes in the first\n",
    "    # slot; in the third slot every renamed column would be read as \"drop\".\n",
    "    # TODO: map the type name in field_type to a Polars dtype before filling the dtype slot\n",
    "    dict_SelectTool[field_name] = (field_rename, None, field_drop)\n",
    "\n",
    "dict_SelectTool"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 108,
   "metadata": {},
   "outputs": [],
   "source": [
    "# First <GuiSettings> element under the current `root`; note this is an Element, not text\n",
    "gui_settings_text = root.find(\".//GuiSettings\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 114,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'AlteryxBasePluginsGui.TextInput.TextInput'"
      ]
     },
     "execution_count": 114,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Dotted plugin identifier of the tool; the tool-listing loop uses segment [2] of it as the tool type\n",
    "gui_settings_text.attrib['Plugin']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import polars as pl\n",
    "\n",
    "def reshape_polars_df(df: pl.DataFrame, col_specs: dict):\n",
    "    \"\"\"\n",
    "    Reshape a Polars DataFrame by dropping, retyping and renaming columns according to the provided dictionary.\n",
    "\n",
    "    Args:\n",
    "        df (pl.DataFrame): The input Polars DataFrame.\n",
    "        col_specs (dict): A dictionary where keys are column names in the original DataFrame,\n",
    "            and values are ``(new_name, dtype, drop)`` tuples. When ``drop`` is truthy the\n",
    "            column is removed and the other two entries are ignored. ``new_name`` may be\n",
    "            None to keep the current name; ``dtype`` may be None to keep the current type.\n",
    "\n",
    "    Returns:\n",
    "        pl.DataFrame: The reshaped Polars DataFrame.\n",
    "    \"\"\"\n",
    "    drops = []\n",
    "    casts = []\n",
    "    renames = {}\n",
    "    for old_name, (new_name, dt, drop) in col_specs.items():\n",
    "        if drop:\n",
    "            drops.append(old_name)\n",
    "            continue\n",
    "        if dt is not None:\n",
    "            casts.append(pl.col(old_name).cast(dt))\n",
    "        if new_name is not None and new_name != old_name:\n",
    "            renames[old_name] = new_name\n",
    "\n",
    "    if drops:\n",
    "        df = df.drop(drops)\n",
    "    # Cast while the columns still carry their original names: casting after the\n",
    "    # rename would reference a column that no longer exists.\n",
    "    if casts:\n",
    "        df = df.with_columns(casts)\n",
    "    # Apply all renames in one call once the casts are done.\n",
    "    if renames:\n",
    "        df = df.rename(renames)\n",
    "    return df\n"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "polaryx",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}