From 7e59f9159f193b8052cdf24efece805127c216bb Mon Sep 17 00:00:00 2001 From: Casey Date: Wed, 14 Aug 2024 21:50:20 +1000 Subject: [PATCH] need to add multi-column join criteria --- scratchpad.ipynb | 176 +++++++++++++++++++++++++++++++++-------------- 1 file changed, 124 insertions(+), 52 deletions(-) diff --git a/scratchpad.ipynb b/scratchpad.ipynb index bd480cb..7e19a8f 100644 --- a/scratchpad.ipynb +++ b/scratchpad.ipynb @@ -53,7 +53,7 @@ }, { "cell_type": "code", - "execution_count": 254, + "execution_count": 329, "metadata": {}, "outputs": [], "source": [ @@ -90,6 +90,100 @@ " return dynamic_code" ] }, + { + "cell_type": "code", + "execution_count": 334, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'joinParams': ('Col_3_renamed', 'Column 3'), 'SelectFields': {'Right_Column 3': ('True', 'Right_Column 3', 'Right_', None), 'Right_Column 4': ('True', 'Right_Column 4', 'Right_', None), '*Unknown': ('True', None, None, None)}}\n", + "None\n" + ] + } + ], + "source": [ + "def tool_join(join_spec: dict, df_L: pl.DataFrame, df_R: pl.DataFrame):\n", + " \n", + "\n", + "\n", + "xml_join_tool = results[6][1]\n", + "join_spec = getConf_Join(xml_join_tool)\n", + "\n", + "print(join_spec)\n", + "\n", + "print(tool_join(join_spec))\n" + ] + }, + { + "cell_type": "code", + "execution_count": 349, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "L in: shape: (3, 3)\n", + "┌─────┬─────┬─────┐\n", + "│ foo ┆ bar ┆ ham │\n", + "│ --- ┆ --- ┆ --- │\n", + "│ i64 ┆ f64 ┆ str │\n", + "╞═════╪═════╪═════╡\n", + "│ 1 ┆ 6.0 ┆ a │\n", + "│ 2 ┆ 7.0 ┆ b │\n", + "│ 3 ┆ 8.0 ┆ c │\n", + "└─────┴─────┴─────┘\n", + "R in: shape: (3, 2)\n", + "┌───────┬─────┐\n", + "│ apple ┆ ham │\n", + "│ --- ┆ --- │\n", + "│ str ┆ str │\n", + "╞═══════╪═════╡\n", + "│ x ┆ a │\n", + "│ y ┆ b │\n", + "│ z ┆ d │\n", + "└───────┴─────┘\n", + "L: shape: (1, 3)\n", + "┌─────┬─────┬─────┐\n", + "│ foo ┆ bar ┆ ham │\n", + "│ --- ┆ --- ┆ --- │\n", + "│ i64 ┆ f64 ┆ str │\n", + "╞═════╪═════╪═════╡\n", + "│ 3 ┆ 8.0 ┆ c │\n", + "└─────┴─────┴─────┘\n" + ] + } + ], + "source": [ + "df = pl.DataFrame(\n", + " {\n", + " \"foo\": [1, 2, 3],\n", + " \"bar\": [6.0, 7.0, 8.0],\n", + " \"ham\": [\"a\", \"b\", \"c\"],\n", + " }\n", + ")\n", + "\n", + "other_df = pl.DataFrame(\n", + " {\n", + " \"apple\": [\"x\", \"y\", \"z\"],\n", + " \"ham\": [\"a\", \"b\", \"d\"],\n", + " }\n", + ")\n", + "\n", + "print(\"L in:\", df)\n", + "\n", + "print(\"R in:\", other_df)\n", + "\n", + "print(\"L:\", df.join(other_df, left_on=\"ham\", right_on=\"ham\", how='anti'))\n", + "\n", + "# print(\"J: \", df.join(other_df, on=\"ham\", how='inner'))\n", + "\n", + "# print(\"R:\", other_df.join(df, on=\"ham\", how='anti'))" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -146,60 +240,16 @@ }, { "cell_type": "code", - "execution_count": 313, + "execution_count": 330, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "joinByRecordPos: False\n", - "{'joinParams': ('Col_3_renamed', 'Column 3')}\n" - ] - } - ], + "outputs": [], "source": [ - "### WIP\n", - "\n", "def getConf_Join(xml_string):\n", - " print(xml_string)\n", + " # print(xml_string)\n", " root = ET.fromstring(xml_string)\n", " dict_JoinTool = {}\n", "\n", - " print(\"joinByRecordPos:\", root.find(\".//Properties/Configuration\").attrib['joinByRecordPos'])\n", + " # print(\"joinByRecordPos:\", root.find(\".//Properties/Configuration\").attrib['joinByRecordPos'])\n", "\n", " # Join parameters\n", " for joinField in root.findall(\".//Configuration/JoinInfo\"):\n", @@ -213,13 +263,35 @@ " else:\n", " dict_JoinTool['joinParams'] = (left_on, right_on)\n", "\n", + " fieldConfig = {}\n", " # Select parameters\n", - " for joinField in root.findall(\".//Configuration/SelectConfiguration\"):\n", + " for joinField in root.findall(\".//SelectField\"):\n", + " field = joinField.attrib['field']\n", + " fieldSelected = joinField.attrib['selected']\n", "\n", + " if field != \"*Unknown\":\n", + " try:\n", + " fieldRename = joinField.attrib['rename']\n", + " except:\n", + " fieldRename = None\n", + " try:\n", + " fieldType = joinField.attrib['type']\n", + " except:\n", + " fieldType = None\n", + "\n", + " fieldInput = joinField.attrib['input']\n", + "\n", + " fieldConfig[field] = (fieldSelected, fieldRename, fieldInput, fieldType)\n", + "\n", + " fieldRename = None\n", + " fieldType = None \n", + " fieldInput = None\n", + "\n", + " dict_JoinTool[\"SelectFields\"] = fieldConfig\n", " return dict_JoinTool\n", "\n", - "xml_join_tool = results[6][1]\n", - "print(getConf_Join(xml_join_tool))\n" + "# xml_join_tool = results[6][1]\n", + "# print(getConf_Join(xml_join_tool))\n" ] }, {