diff --git a/Alteryx_datatypes.ods b/Alteryx_datatypes.ods new file mode 100644 index 0000000..003728c Binary files /dev/null and b/Alteryx_datatypes.ods differ diff --git a/scratchpad.ipynb b/scratchpad.ipynb index 641976e..bd480cb 100644 --- a/scratchpad.ipynb +++ b/scratchpad.ipynb @@ -9,12 +9,11 @@ }, { "cell_type": "code", - "execution_count": 23, + "execution_count": 293, "metadata": {}, "outputs": [], "source": [ "# Open Alteryx XML into a string \n", - "\n", "import polars as pl \n", "import xml.etree.ElementTree as ET\n", "\n", @@ -27,12 +26,11 @@ }, { "cell_type": "code", - "execution_count": 24, + "execution_count": 294, "metadata": {}, "outputs": [], "source": [ "# Parse out nodes (tool data) into a dict\n", - "\n", "def extract_tool_id_and_contents(xml_string):\n", " root = ET.fromstring(xml_string)\n", " results = []\n", @@ -55,27 +53,41 @@ }, { "cell_type": "code", - "execution_count": 25, + "execution_count": 254, "metadata": {}, "outputs": [], "source": [ - "def selectTool(df: pl.DataFrame, col_specs: dict):\n", - " \"\"\"\n", - " Reshape a Polars DataFrame by renaming and retyping columns according to the provided dictionary.\n", + "def tool_select(col_spec: dict):\n", + " \"\"\" Generates select tool code\"\"\"\n", + " dynamic_code = \"df_output = df.with_columns(\\n\"\n", + " dynamic_code_suffix = ''\n", + " for old_name, (new_name, type, selected) in col_spec.items():\n", "\n", - " Args:\n", - " df (pl.DataFrame): The input Polars DataFrame.\n", - " col_specs (dict): A dictionary where keys are column names in the original DataFrame,\n", - " and values are tuples containing the new column name and data type.\n", + " if old_name == '*Unknown':\n", + " break \n", + " \n", + " if new_name:\n", + " alias = f\".alias('{new_name}')\"\n", + " dynamic_code_suffix += f\"df_output = df_output.drop(f'{old_name}')\\n\"\n", + " else:\n", + " alias = ''\n", "\n", - " Returns:\n", - " pl.DataFrame: The reshaped Polars DataFrame with renamed and retyped columns.\n", - " \"\"\"\n", - " for old_name, (new_name, dt) in col_specs.items():\n", - " df = df.rename({old_name: new_name})\n", - " if dt is not None:\n", - " df = df.with_column(pl.col(old_name).cast(dt))\n", - " return df" + " if type is not None:\n", + " if 'Int' in type:\n", + " cast = f\".cast(pl.{pl.Int64})\"\n", + " elif 'String' in type:\n", + " cast = f\".cast(pl.{pl.String})\"\n", + " else:\n", + " cast = ''\n", + "\n", + " if selected != 'False':\n", + " dynamic_code += f\"df.select(pl.col(f'{old_name}'){cast}{alias}),\\n\"\n", + " else:\n", + " dynamic_code_suffix += f\"df_output = df_output.drop(f'{old_name}')\\n\"\n", + "\n", + " dynamic_code += \")\\n\" + dynamic_code_suffix\n", + " \n", + " return dynamic_code" ] }, { @@ -87,26 +99,139 @@ }, { "cell_type": "code", - "execution_count": 26, + "execution_count": 295, "metadata": {}, "outputs": [], "source": [ - "def TextInputToDf(xml_string):\n", + "def input_textInput(xml_string):\n", " # Get XML for a Text input tool\n", - " root = ET.fromstring(results[3][1])\n", + " root = ET.fromstring(xml_string)\n", " # Extract the field names\n", " fields = [field.attrib['name'] for field in root.findall(\".//Fields/Field\")]\n", " # Extract the data rows\n", " data_rows = [[int(c.text) for c in row.findall(\"c\")] for row in root.findall(\".//Data/r\")]\n", " # Create the polars dataframe\n", - " df = pl.DataFrame(data_rows, fields)\n", + " df = pl.DataFrame(data_rows, fields, orient=\"row\")\n", " # Display the dataframe\n", " return df" ] }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 296, + "metadata": {}, + "outputs": [], + "source": [ + "def getConf_Select(xml_string):\n", + " root = ET.fromstring(xml_string)\n", + " dict_SelectTool = {}\n", + "\n", + " for field in root.findall(\".//SelectFields/SelectField\"):\n", + " field_name = field.attrib['field']\n", + " field_selected = field.attrib['selected']\n", + "\n", + " try:\n", + " field_type = field.attrib['type']\n", + " except:\n", + " field_type = None\n", + " try:\n", + " field_rename = field.attrib['rename']\n", + " except:\n", + " field_rename = None\n", + "\n", + " dict_SelectTool[field_name] = (field_rename, field_type, field_selected)\n", + "\n", + " return dict_SelectTool" + ] + }, + { + "cell_type": "code", + "execution_count": 313, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "joinByRecordPos: False\n", + "{'joinParams': ('Col_3_renamed', 'Column 3')}\n" + ] + } + ], + "source": [ + "### WIP\n", + "\n", + "def getConf_Join(xml_string):\n", + " print(xml_string)\n", + " root = ET.fromstring(xml_string)\n", + " dict_JoinTool = {}\n", + "\n", + " print(\"joinByRecordPos:\", root.find(\".//Properties/Configuration\").attrib['joinByRecordPos'])\n", + "\n", + " # Join parameters\n", + " for joinField in root.findall(\".//Configuration/JoinInfo\"):\n", + " if joinField.attrib['connection'] == \"Left\":\n", + " left_on = joinField.find('Field').attrib['field']\n", + " elif joinField.attrib['connection'] == \"Right\":\n", + " right_on = joinField.find('Field').attrib['field']\n", + "\n", + " if left_on == right_on:\n", + " dict_JoinTool['joinParams'] = ('on', left_on)\n", + " else:\n", + " dict_JoinTool['joinParams'] = (left_on, right_on)\n", + "\n", + " # Select parameters\n", + " for joinField in root.findall(\".//Configuration/SelectConfiguration\"):\n", + "\n", + " return dict_JoinTool\n", + "\n", + "xml_join_tool = results[6][1]\n", + "print(getConf_Join(xml_join_tool))\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Working with the XML file" + ] + }, + { + "cell_type": "code", + "execution_count": 40, "metadata": {}, "outputs": [ { @@ -151,7 +276,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "C:\\Users\\casey.morter\\AppData\\Local\\Temp\\ipykernel_4012\\3571569777.py:9: DataOrientationWarning: Row orientation inferred during DataFrame construction. Explicitly specify the orientation by passing `orient=\"row\"` to silence this warning.\n", + "/tmp/ipykernel_579015/219306832.py:9: DataOrientationWarning: Row orientation inferred during DataFrame construction. Explicitly specify the orientation by passing `orient=\"row\"` to silence this warning.\n", " df = pl.DataFrame(data_rows, fields)\n" ] } @@ -167,163 +292,100 @@ " print(ToolID, ToolType)\n", "\n", " if ToolType == 'TextInput':\n", - " print(TextInputToDf(ToolXML))" + " print(input_textInput(ToolXML))" ] }, { "cell_type": "code", - "execution_count": 29, - "metadata": {}, - "outputs": [], - "source": [ - "# Check out a tool\n", - "# 0 = TextInput\n", - "# 4 = select with rename\n", - "tool_xml = results[4][1]" - ] - }, - { - "cell_type": "code", - "execution_count": 27, + "execution_count": 292, "metadata": {}, "outputs": [ { - "data": { - "text/plain": [ - "'c:\\\\Users\\\\casey.morter\\\\OneDrive - JLL\\\\Documents\\\\01 Workspace\\\\01 Python\\\\Polaryx'" - ] - }, - "execution_count": 27, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "%pwd" - ] - }, - { - "cell_type": "code", - "execution_count": 31, - "metadata": {}, - "outputs": [ + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "############### Input dataframe (TextInput):\n", + "\n", + " shape: (3, 2)\n", + "┌──────────┬──────────┐\n", + "│ Column 3 ┆ Column 4 │\n", + "│ --- ┆ --- │\n", + "│ i64 ┆ i64 │\n", + "╞══════════╪══════════╡\n", + "│ 5 ┆ 8 │\n", + "│ 6 ┆ 9 │\n", + "│ 7 ┆ 10 │\n", + "└──────────┴──────────┘\n", + "\n", + "############### Generated code from Select tool: \n", + "\n", + " df_output = df.with_columns(\n", + "df.select(pl.col(f'Column 3').alias('Col_3_renamed')),\n", + ")\n", + "df_output = df_output.drop(f'Column 3')\n", + "df_output = df_output.drop(f'Column 4')\n", + "\n", + "shape: (3, 1)\n", + "┌───────────────┐\n", + "│ Col_3_renamed │\n", + "│ --- │\n", + "│ i64 │\n", + "╞═══════════════╡\n", + "│ 5 │\n", + "│ 6 │\n", + "│ 7 │\n", + "└───────────────┘\n" + ] + }, { - "ename": "KeyError", - "evalue": "'type'", - "output_type": "error", - "traceback": [ - "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[1;31mKeyError\u001b[0m Traceback (most recent call last)", - "Cell \u001b[1;32mIn[31], line 16\u001b[0m\n\u001b[0;32m 14\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m field \u001b[38;5;129;01min\u001b[39;00m root\u001b[38;5;241m.\u001b[39mfindall(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m.//SelectFields/SelectField\u001b[39m\u001b[38;5;124m\"\u001b[39m):\n\u001b[0;32m 15\u001b[0m field_name \u001b[38;5;241m=\u001b[39m field\u001b[38;5;241m.\u001b[39mattrib[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mfield\u001b[39m\u001b[38;5;124m'\u001b[39m]\n\u001b[1;32m---> 16\u001b[0m field_type \u001b[38;5;241m=\u001b[39m \u001b[43mfield\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mattrib\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mtype\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m]\u001b[49m\n\u001b[0;32m 18\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m 19\u001b[0m field_rename \u001b[38;5;241m=\u001b[39m field\u001b[38;5;241m.\u001b[39mattrib[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mrename\u001b[39m\u001b[38;5;124m'\u001b[39m]\n", - "\u001b[1;31mKeyError\u001b[0m: 'type'" + "name": "stderr", + "output_type": "stream", + "text": [ + "/tmp/ipykernel_579015/3571569777.py:9: DataOrientationWarning: Row orientation inferred during DataFrame construction. Explicitly specify the orientation by passing `orient=\"row\"` to silence this warning.\n", + " df = pl.DataFrame(data_rows, fields)\n" ] } ], "source": [ - "col_specs = {\n", - " \"A\": (\"x\", int, False),\n", - " \"B\": (\"y\", str, False),\n", - " \"D\": (None, None, True) # drop this column\n", - "}\n", + "# Tool 1: TextInput\n", + "xml_tool1 = results[3][1]\n", + "df_in = TextInputToDf(ToolXML)\n", + "\n", + "print(\"\\n############### Input dataframe (TextInput):\\n\\n\", df_in)\n", + "\n", + "# Tool 2: Select\n", + "col_spec = getConf_Select(results[4][1])\n", + "\n", + "# print(\"\\n############### Select tool spec taken from XML:\\n\\n\", col_spec)\n", "\n", "\n", + "# Generate code from \n", + "code = tool_select(col_spec)\n", "\n", - "# Parse the XML\n", - "root = ET.fromstring(tool_xml)\n", + "print(\"\\n############### Generated code from Select tool: \\n\\n\", code)\n", "\n", - "dict_SelectTool = {}\n", + "df = df_in\n", "\n", - "for field in root.findall(\".//SelectFields/SelectField\"):\n", - " field_name = field.attrib['field']\n", + "exec(code)\n", "\n", - " try:\n", - " field_type = field.attrib['type']\n", - " except:\n", - " field_type = None\n", - " \n", - " try:\n", - " field_rename = field.attrib['rename']\n", - " except:\n", - " field_rename = None\n", - "\n", - "\n", - "\n", - "\n", - " dict_SelectTool[field_name] = (None, None, field_rename)\n", - " \n", - "\n", - " # dict_SelectTool['field']\n", - "\n", - " # if field.attrib['field'] != '*Unknown':\n", - " # field.attrib['selected'], \"Type: \", field.attrib['size']\n", - "\n", - "\n", - "\n", - " \n", - " # print(field.attrib['field'], \"| Selected:\", field.attrib['selected'], \"Type: \", field.attrib['size'])\n", - "\n", - "dict_SelectTool" + "print(df_output)\n", + "\n" ] }, { "cell_type": "code", - "execution_count": 108, + "execution_count": 174, "metadata": {}, "outputs": [], - "source": [ - "gui_settings_text = root.find(\".//GuiSettings\")" - ] - }, - { - "cell_type": "code", - "execution_count": 114, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "'AlteryxBasePluginsGui.TextInput.TextInput'" - ] - }, - "execution_count": 114, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "gui_settings_text.attrib['Plugin']" - ] + "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], - "source": [ - "import polars as pl\n", - "\n", - "def reshape_polars_df(df: pl.DataFrame, col_specs: dict):\n", - " \"\"\"\n", - " Reshape a Polars DataFrame by renaming and retyping columns according to the provided dictionary.\n", - "\n", - " Args:\n", - " df (pl.DataFrame): The input Polars DataFrame.\n", - " col_specs (dict): A dictionary where keys are column names in the original DataFrame,\n", - " and values are tuples containing the new column name, data type, and a boolean indicating whether\n", - " the column should be dropped or not.\n", - "\n", - " Returns:\n", - " pl.DataFrame: The reshaped Polars DataFrame with renamed and retyped columns.\n", - " \"\"\"\n", - " for old_name, (new_name, dt, drop) in col_specs.items():\n", - " if drop:\n", - " df = df.drop(old_name)\n", - " else:\n", - " df = df.rename({old_name: new_name})\n", - " if dt is not None:\n", - " df = df.with_column(pl.col(old_name).cast(dt))\n", - " return df\n" - ] + "source": [] } ], "metadata": {