From 71559b78944fdb510abe5c596c7737799c0db022 Mon Sep 17 00:00:00 2001 From: "casey.morter" Date: Tue, 20 Aug 2024 16:11:32 +1000 Subject: [PATCH] Update scratchpad.ipynb --- scratchpad.ipynb | 320 ++++++++++++++++++++++++++++++++--------------- 1 file changed, 221 insertions(+), 99 deletions(-) diff --git a/scratchpad.ipynb b/scratchpad.ipynb index 7e19a8f..5d84840 100644 --- a/scratchpad.ipynb +++ b/scratchpad.ipynb @@ -9,7 +9,7 @@ }, { "cell_type": "code", - "execution_count": 293, + "execution_count": 127, "metadata": {}, "outputs": [], "source": [ @@ -26,22 +26,61 @@ }, { "cell_type": "code", - "execution_count": 294, + "execution_count": 128, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "None\n", + "None\n", + "None\n", + "None\n", + "None\n", + "None\n", + "None\n", + "None\n", + "None\n", + "None\n", + "None\n", + "None\n", + "None\n", + "('1', '\\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n 1\\n 4\\n \\n \\n 2\\n 5\\n \\n \\n 3\\n 6\\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n ')\n", + "('2', '\\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n ')\n", + "('3', '\\n \\n \\n \\n \\n \\n C:\\\\Users\\\\Casey\\\\AppData\\\\Local\\\\Temp\\\\Engine_4072_8b6c0740e308d445ab856d90eb0e4ee9_\\\\Engine_4072_93f5c2999bcb8c478c002b15166e2b45~.yxdb\\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n ')\n", + "('4', '\\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n 5\\n 8\\n z\\n a\\n \\n \\n 6\\n 9\\n b\\n d\\n \\n \\n 7\\n 10\\n c\\n c\\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n ')\n", + "('5', '\\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n ')\n", + "('6', '\\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n MoreCount = [Count] + 10\\n\\n \\n \\n \\n \\n \\n ')\n", + "('7', '\\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n ')\n", + "('10', '\\n \\n \\n \\n \\n \\n C:\\\\Users\\\\Casey\\\\AppData\\\\Local\\\\Temp\\\\Engine_4072_8b6c0740e308d445ab856d90eb0e4ee9_\\\\Engine_4072_ab9b90f8fc991440905d667f1d5c7325~.yxdb\\n \\n \\n \\n \\n
\\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n ')\n", + "('11', '\\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n Path = [Engine.TempFilePath]\\n\\n \\n \\n \\n \\n \\n ')\n", + "('12', '\\n \\n \\n \\n \\n \\n First\\n 1\\n \\n \\n \\n \\n First 1\\n \\n \\n \\n \\n \\n ')\n", + "('13', '\\n \\n \\n \\n \\n \\n C:\\\\Users\\\\Casey\\\\AppData\\\\Local\\\\Temp\\\\Engine_4072_8b6c0740e308d445ab856d90eb0e4ee9_\\\\Engine_4072_dd72ded80941104b9b9be56761379cb2~.yxdb\\n \\n \\n \\n \\n
\\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n ')\n", + "('14', '\\n \\n \\n \\n \\n \\n .\\\\SimpleWorkflowOut.csv\\n \\n False\\n \\n CRLF\\n ,\\n False\\n True\\n 28591\\n True\\n \\n \\n \\n \\n \\n SimpleWorkflowOut.csv\\n \\n \\n \\n \\n \\n ')\n", + "('15', '\\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n a\\n a\\n 4\\n \\n \\n b\\n d\\n 6\\n \\n \\n c\\n z\\n 7\\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n ')\n" + ] + } + ], "source": [ "# Parse out nodes (tool data) into a dict\n", "def extract_tool_id_and_contents(xml_string):\n", " root = ET.fromstring(xml_string)\n", " results = []\n", " for node in root.iter('Node'):\n", + " tool_type = node.attrib.get('.//GuiSettings')\n", + " print(tool_type)\n", " tool_id = node.attrib.get('ToolID')\n", " content = ET.tostring(node, encoding='unicode')\n", " results.append((tool_id, content))\n", " return results\n", "\n", "\n", - "results = extract_tool_id_and_contents(xml_string)" + "results = extract_tool_id_and_contents(xml_string)\n", + "\n", + "\n", + "for tool in results:\n", + " print(tool)" ] }, { @@ -53,7 +92,7 @@ }, { "cell_type": "code", - "execution_count": 329, + "execution_count": 129, "metadata": {}, "outputs": [], "source": [ @@ -92,15 +131,15 @@ }, { "cell_type": "code", - "execution_count": 334, + "execution_count": 130, "metadata": {}, "outputs": [ { - "name": "stdout", - "output_type": "stream", - "text": [ - "{'joinParams': ('Col_3_renamed', 'Column 3'), 'SelectFields': {'Right_Column 3': ('True', 'Right_Column 3', 'Right_', None), 'Right_Column 4': ('True', 'Right_Column 4', 'Right_', None), '*Unknown': ('True', None, None, None)}}\n", - "None\n" + "ename": "IndentationError", + "evalue": "expected an indented block after function definition on line 1 (2015830356.py, line 5)", + "output_type": "error", + "traceback": [ + "\u001b[1;36m Cell \u001b[1;32mIn[130], line 5\u001b[1;36m\u001b[0m\n\u001b[1;33m xml_join_tool = results[6][1]\u001b[0m\n\u001b[1;37m ^\u001b[0m\n\u001b[1;31mIndentationError\u001b[0m\u001b[1;31m:\u001b[0m expected an indented block after function definition on line 1\n" ] } ], @@ -119,7 +158,7 @@ }, { "cell_type": "code", - "execution_count": 349, + "execution_count": 76, "metadata": {}, "outputs": [ { @@ -193,7 +232,7 @@ }, { "cell_type": "code", - "execution_count": 295, + "execution_count": 131, "metadata": {}, "outputs": [], "source": [ @@ -203,7 +242,7 @@ " # Extract the field names\n", " fields = [field.attrib['name'] for field in root.findall(\".//Fields/Field\")]\n", " # Extract the data rows\n", - " data_rows = [[int(c.text) for c in row.findall(\"c\")] for row in root.findall(\".//Data/r\")]\n", + " data_rows = [[c.text for c in row.findall(\"c\")] for row in root.findall(\".//Data/r\")]\n", " # Create the polars dataframe\n", " df = pl.DataFrame(data_rows, fields, orient=\"row\")\n", " # Display the dataframe\n", @@ -212,7 +251,7 @@ }, { "cell_type": "code", - "execution_count": 296, + "execution_count": 132, "metadata": {}, "outputs": [], "source": [ @@ -240,9 +279,70 @@ }, { "cell_type": "code", - "execution_count": 330, + "execution_count": 133, "metadata": {}, - "outputs": [], + "outputs": [ + { + "ename": "SyntaxError", + "evalue": "invalid syntax (213487073.py, line 22)", + "output_type": "error", + "traceback": [ + "\u001b[1;36m Cell \u001b[1;32mIn[133], line 22\u001b[1;36m\u001b[0m\n\u001b[1;33m if fields.attrib['selected'] == 'True' and fields.attrib['field'] != :\u001b[0m\n\u001b[1;37m ^\u001b[0m\n\u001b[1;31mSyntaxError\u001b[0m\u001b[1;31m:\u001b[0m invalid syntax\n" + ] + } + ], + "source": [ + "def getConf_Join(xml_string):\n", + " root = ET.fromstring(xml_string)\n", + " dict_JoinTool = {}\n", + "\n", + " # Join parameters\n", + " for joinField in root.findall(\".//Configuration/JoinInfo\"):\n", + " if joinField.attrib['connection'] == \"Left\":\n", + " left_on = [field.attrib['field'] for field in joinField]\n", + " if joinField.attrib['connection'] == \"Right\":\n", + " right_on = [field.attrib['field'] for field in joinField]\n", + "\n", + " if left_on == right_on:\n", + " dict_JoinTool['joinParams'] = ('on', left_on)\n", + " else:\n", + " dict_JoinTool['joinParams'] = (left_on, right_on)\n", + "\n", + " \n", + " # Field Parameters\n", + " fieldConfig = {}\n", + " for fields in root.findall(\".//SelectField\"):\n", + " result = []\n", + " if fields.attrib['selected'] == 'True' and fields.attrib['field'] != :\n", + " for field in ['rename', 'input', 'type', 'size']:\n", + " try:\n", + " result.append(fields.attrib[field])\n", + " except KeyError:\n", + " result.append(None)\n", + "\n", + " fieldConfig[fields.attrib['field']] = result\n", + "\n", + "\n", + " dict_JoinTool['fields'] = fieldConfig\n", + "\n", + " return dict_JoinTool\n", + "\n", + "print(getConf_Join(xml_join_tool))" + ] + }, + { + "cell_type": "code", + "execution_count": 134, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'joinParams': ([, ], [, ]), 'SelectFields': {'Right_Ham': ('True', 'Right_Ham', 'Right_', None), 'Right_Cheese': ('True', 'Right_Cheese', 'Right_', None), 'Right_Column 3': ('True', 'Right_Column 3', 'Right_', 'V_String'), 'Right_Column 4': ('True', 'Right_Column 4', 'Right_', 'Int64'), '*Unknown': ('True', None, None, None)}}\n" + ] + } + ], "source": [ "def getConf_Join(xml_string):\n", " # print(xml_string)\n", @@ -253,10 +353,22 @@ "\n", " # Join parameters\n", " for joinField in root.findall(\".//Configuration/JoinInfo\"):\n", + " \n", + "\n", + " # left_on = [joinField.findall('Field') for c in joinField.attrib['connection'] == \"Left\"] \n", + "\n", " if joinField.attrib['connection'] == \"Left\":\n", - " left_on = joinField.find('Field').attrib['field']\n", + " pass\n", + " left_on = joinField.findall('Field') #.attrib['field']\n", + "\n", + "\n", + "\n", + " \n", + " # for field in left_on:\n", + " # print(field.attrib['field'])\n", + "\n", " elif joinField.attrib['connection'] == \"Right\":\n", - " right_on = joinField.find('Field').attrib['field']\n", + " right_on = joinField.findall('Field') #.attrib['field']\n", "\n", " if left_on == right_on:\n", " dict_JoinTool['joinParams'] = ('on', left_on)\n", @@ -264,6 +376,7 @@ " dict_JoinTool['joinParams'] = (left_on, right_on)\n", "\n", " fieldConfig = {}\n", + "\n", " # Select parameters\n", " for joinField in root.findall(\".//SelectField\"):\n", " field = joinField.attrib['field']\n", @@ -290,8 +403,8 @@ " dict_JoinTool[\"SelectFields\"] = fieldConfig\n", " return dict_JoinTool\n", "\n", - "# xml_join_tool = results[6][1]\n", - "# print(getConf_Join(xml_join_tool))\n" + "xml_join_tool = results[6][1]\n", + "print(getConf_Join(xml_join_tool))\n" ] }, { @@ -303,7 +416,7 @@ }, { "cell_type": "code", - "execution_count": 40, + "execution_count": 89, "metadata": {}, "outputs": [ { @@ -312,28 +425,28 @@ "text": [ "1 TextInput\n", "shape: (3, 2)\n", - "┌──────────┬──────────┐\n", - "│ Column 3 ┆ Column 4 │\n", - "│ --- ┆ --- │\n", - "│ i64 ┆ i64 │\n", - "╞══════════╪══════════╡\n", - "│ 5 ┆ 8 │\n", - "│ 6 ┆ 9 │\n", - "│ 7 ┆ 10 │\n", - "└──────────┴──────────┘\n", + "┌─────────┬──────────┐\n", + "│ Column1 ┆ Column 2 │\n", + "│ --- ┆ --- │\n", + "│ str ┆ str │\n", + "╞═════════╪══════════╡\n", + "│ 1 ┆ 4 │\n", + "│ 2 ┆ 5 │\n", + "│ 3 ┆ 6 │\n", + "└─────────┴──────────┘\n", "2 AlteryxSelect\n", "3 BrowseV2\n", "4 TextInput\n", - "shape: (3, 2)\n", - "┌──────────┬──────────┐\n", - "│ Column 3 ┆ Column 4 │\n", - "│ --- ┆ --- │\n", - "│ i64 ┆ i64 │\n", - "╞══════════╪══════════╡\n", - "│ 5 ┆ 8 │\n", - "│ 6 ┆ 9 │\n", - "│ 7 ┆ 10 │\n", - "└──────────┴──────────┘\n", + "shape: (3, 4)\n", + "┌──────────┬──────────┬─────┬────────┐\n", + "│ Column 3 ┆ Column 4 ┆ Ham ┆ Cheese │\n", + "│ --- ┆ --- ┆ --- ┆ --- │\n", + "│ str ┆ str ┆ str ┆ str │\n", + "╞══════════╪══════════╪═════╪════════╡\n", + "│ 5 ┆ 8 ┆ z ┆ a │\n", + "│ 6 ┆ 9 ┆ b ┆ d │\n", + "│ 7 ┆ 10 ┆ c ┆ c │\n", + "└──────────┴──────────┴─────┴────────┘\n", "5 AlteryxSelect\n", "6 Formula\n", "7 Join\n", @@ -341,15 +454,18 @@ "11 Formula\n", "12 Sample\n", "13 BrowseV2\n", - "14 DbFileOutput\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/tmp/ipykernel_579015/219306832.py:9: DataOrientationWarning: Row orientation inferred during DataFrame construction. Explicitly specify the orientation by passing `orient=\"row\"` to silence this warning.\n", - " df = pl.DataFrame(data_rows, fields)\n" + "14 DbFileOutput\n", + "15 TextInput\n", + "shape: (3, 3)\n", + "┌─────┬────────┬───────┐\n", + "│ Ham ┆ Cheese ┆ Count │\n", + "│ --- ┆ --- ┆ --- │\n", + "│ str ┆ str ┆ str │\n", + "╞═════╪════════╪═══════╡\n", + "│ a ┆ a ┆ 4 │\n", + "│ b ┆ d ┆ 6 │\n", + "│ c ┆ z ┆ 7 │\n", + "└─────┴────────┴───────┘\n" ] } ], @@ -369,7 +485,7 @@ }, { "cell_type": "code", - "execution_count": 292, + "execution_count": 135, "metadata": {}, "outputs": [ { @@ -378,79 +494,85 @@ "text": [ "\n", "############### Input dataframe (TextInput):\n", - "\n", - " shape: (3, 2)\n", - "┌──────────┬──────────┐\n", - "│ Column 3 ┆ Column 4 │\n", - "│ --- ┆ --- │\n", - "│ i64 ┆ i64 │\n", - "╞══════════╪══════════╡\n", - "│ 5 ┆ 8 │\n", - "│ 6 ┆ 9 │\n", - "│ 7 ┆ 10 │\n", - "└──────────┴──────────┘\n", + " shape: (3, 4)\n", + "┌──────────┬──────────┬─────┬────────┐\n", + "│ Column 3 ┆ Column 4 ┆ Ham ┆ Cheese │\n", + "│ --- ┆ --- ┆ --- ┆ --- │\n", + "│ str ┆ str ┆ str ┆ str │\n", + "╞══════════╪══════════╪═════╪════════╡\n", + "│ 5 ┆ 8 ┆ z ┆ a │\n", + "│ 6 ┆ 9 ┆ b ┆ d │\n", + "│ 7 ┆ 10 ┆ c ┆ c │\n", + "└──────────┴──────────┴─────┴────────┘\n", "\n", "############### Generated code from Select tool: \n", - "\n", " df_output = df.with_columns(\n", "df.select(pl.col(f'Column 3').alias('Col_3_renamed')),\n", ")\n", "df_output = df_output.drop(f'Column 3')\n", "df_output = df_output.drop(f'Column 4')\n", "\n", - "shape: (3, 1)\n", - "┌───────────────┐\n", - "│ Col_3_renamed │\n", - "│ --- │\n", - "│ i64 │\n", - "╞═══════════════╡\n", - "│ 5 │\n", - "│ 6 │\n", - "│ 7 │\n", - "└───────────────┘\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/tmp/ipykernel_579015/3571569777.py:9: DataOrientationWarning: Row orientation inferred during DataFrame construction. Explicitly specify the orientation by passing `orient=\"row\"` to silence this warning.\n", - " df = pl.DataFrame(data_rows, fields)\n" + "\n", + "############### Output DF: \n", + "\n", + " shape: (3, 3)\n", + "┌─────┬────────┬───────────────┐\n", + "│ Ham ┆ Cheese ┆ Col_3_renamed │\n", + "│ --- ┆ --- ┆ --- │\n", + "│ str ┆ str ┆ str │\n", + "╞═════╪════════╪═══════════════╡\n", + "│ z ┆ a ┆ 5 │\n", + "│ b ┆ d ┆ 6 │\n", + "│ c ┆ c ┆ 7 │\n", + "└─────┴────────┴───────────────┘\n" ] } ], "source": [ "# Tool 1: TextInput\n", "xml_tool1 = results[3][1]\n", - "df_in = TextInputToDf(ToolXML)\n", - "\n", - "print(\"\\n############### Input dataframe (TextInput):\\n\\n\", df_in)\n", - "\n", + "df_in = input_textInput(xml_tool1)\n", + "print(\"\\n############### Input dataframe (TextInput):\\n\", df_in)\n", "# Tool 2: Select\n", "col_spec = getConf_Select(results[4][1])\n", - "\n", - "# print(\"\\n############### Select tool spec taken from XML:\\n\\n\", col_spec)\n", - "\n", - "\n", "# Generate code from \n", "code = tool_select(col_spec)\n", - "\n", - "print(\"\\n############### Generated code from Select tool: \\n\\n\", code)\n", - "\n", + "print(\"\\n############### Generated code from Select tool: \\n\", code)\n", + "# Execute and display output\n", "df = df_in\n", - "\n", "exec(code)\n", - "\n", - "print(df_output)\n", - "\n" + "print(\"\\n############### Output DF: \\n\\n\", df_output)" ] }, { "cell_type": "code", - "execution_count": 174, + "execution_count": 92, "metadata": {}, - "outputs": [], - "source": [] + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "('1', '\\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n 1\\n 4\\n \\n \\n 2\\n 5\\n \\n \\n 3\\n 6\\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n ')\n", + "('2', '\\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n ')\n", + "('3', '\\n \\n \\n \\n \\n \\n C:\\\\Users\\\\Casey\\\\AppData\\\\Local\\\\Temp\\\\Engine_4072_8b6c0740e308d445ab856d90eb0e4ee9_\\\\Engine_4072_93f5c2999bcb8c478c002b15166e2b45~.yxdb\\n \\n \\n \\n \\n
\\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n ')\n", + "('4', '\\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n 5\\n 8\\n z\\n a\\n \\n \\n 6\\n 9\\n b\\n d\\n \\n \\n 7\\n 10\\n c\\n c\\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n ')\n", + "('5', '\\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n ')\n", + "('6', '\\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n MoreCount = [Count] + 10\\n\\n \\n \\n \\n \\n \\n ')\n", + "('7', '\\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n ')\n", + "('10', '\\n \\n \\n \\n \\n \\n C:\\\\Users\\\\Casey\\\\AppData\\\\Local\\\\Temp\\\\Engine_4072_8b6c0740e308d445ab856d90eb0e4ee9_\\\\Engine_4072_ab9b90f8fc991440905d667f1d5c7325~.yxdb\\n \\n \\n \\n \\n
\\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n ')\n", + "('11', '\\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n Path = [Engine.TempFilePath]\\n\\n \\n \\n \\n \\n \\n ')\n", + "('12', '\\n \\n \\n \\n \\n \\n First\\n 1\\n \\n \\n \\n \\n First 1\\n \\n \\n \\n \\n \\n ')\n", + "('13', '\\n \\n \\n \\n \\n \\n C:\\\\Users\\\\Casey\\\\AppData\\\\Local\\\\Temp\\\\Engine_4072_8b6c0740e308d445ab856d90eb0e4ee9_\\\\Engine_4072_dd72ded80941104b9b9be56761379cb2~.yxdb\\n \\n \\n \\n \\n
\\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n ')\n", + "('14', '\\n \\n \\n \\n \\n \\n .\\\\SimpleWorkflowOut.csv\\n \\n False\\n \\n CRLF\\n ,\\n False\\n True\\n 28591\\n True\\n \\n \\n \\n \\n \\n SimpleWorkflowOut.csv\\n \\n \\n \\n \\n \\n ')\n", + "('15', '\\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n a\\n a\\n 4\\n \\n \\n b\\n d\\n 6\\n \\n \\n c\\n z\\n 7\\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n ')\n" + ] + } + ], + "source": [ + "for tool in results:\n", + " print(tool)" + ] }, { "cell_type": "code",