{ "cells": [ { "cell_type": "code", "execution_count": 1, "id": "045e0f5b", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/tmp/ipykernel_2684932/3289088492.py:1: DeprecationWarning: \n", "Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),\n", "(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)\n", "but was not found to be installed on your system.\n", "If this would cause problems for you,\n", "please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466\n", " \n", " import pandas as pd\n" ] } ], "source": [ "import pandas as pd\n", "import numpy as np\n", "import sys\n", "from sklearn.model_selection import train_test_split\n", "load(\"~/conjecturing/sage/conjecturing.py\")" ] }, { "cell_type": "markdown", "id": "c561be47-a291-4492-baa8-5cc698c879cb", "metadata": {}, "source": [ "Specify output files." ] }, { "cell_type": "code", "execution_count": 2, "id": "4861be5f-d0ae-4a84-b768-b6903560f130", "metadata": {}, "outputs": [], "source": [ "inv_file = open(\"2022_12_07_inv.txt\", \"w\")\n", "prop_file = open(\"2022_12_07_prop.txt\", \"w\")" ] }, { "cell_type": "markdown", "id": "f5db5172-0de7-4bfa-ad8e-6804f0b1798a", "metadata": { "tags": [] }, "source": [ "Specify the number of examples to use for conjecturing and skips." ] }, { "cell_type": "code", "execution_count": 3, "id": "cc76b0a8-7d39-4ce4-aecc-7495a82aaafa", "metadata": {}, "outputs": [], "source": [ "num_train = 100\n", "my_skips = 0.3" ] }, { "cell_type": "markdown", "id": "c229b455-e876-4ce7-a910-c6766f168ecf", "metadata": {}, "source": [ "Read data. " ] }, { "cell_type": "code", "execution_count": 4, "id": "e449e88a", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "(25808, 63)\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
YearDSAProgram_Year.xFacility_CountHIE_B_CountWeighted_HIE_DSAHIE_DSA_PenetrationTimeZIPCounty.Code...Tx_R_O1Tx_R_ST1Tx_R_ST2Flagging_ratioTx_R_ST_CM_R_O1M_R_ST1HalfYear.yHIE_DSA_PP_HIE_Adopt
02012ALOB201310200.001350101037...0.05111.010.081700
12012ALOB201310200.001360331013...0.05111.010.081700
22012ALOB201310200.001354011125...0.05111.010.081700
32012ALOB201310200.001364011035...0.05111.010.081700
42012ALOB201310200.001354621063...0.05111.010.081700
\n", "

5 rows × 63 columns

\n", "
" ], "text/plain": [ " Year DSA Program_Year.x Facility_Count HIE_B_Count Weighted_HIE_DSA \\\n", "0 2012 ALOB 2013 102 0 0.0 \n", "1 2012 ALOB 2013 102 0 0.0 \n", "2 2012 ALOB 2013 102 0 0.0 \n", "3 2012 ALOB 2013 102 0 0.0 \n", "4 2012 ALOB 2013 102 0 0.0 \n", "\n", " HIE_DSA_Penetration Time ZIP County.Code ... Tx_R_O1 Tx_R_ST1 \\\n", "0 0 1 35010 1037 ... 0.05 1 \n", "1 0 1 36033 1013 ... 0.05 1 \n", "2 0 1 35401 1125 ... 0.05 1 \n", "3 0 1 36401 1035 ... 0.05 1 \n", "4 0 1 35462 1063 ... 0.05 1 \n", "\n", " Tx_R_ST2 Flagging_ratio Tx_R_ST_C M_R_O1 M_R_ST1 HalfYear.y HIE_DSA_P \\\n", "0 1 1.0 1 0.08 1 7 0 \n", "1 1 1.0 1 0.08 1 7 0 \n", "2 1 1.0 1 0.08 1 7 0 \n", "3 1 1.0 1 0.08 1 7 0 \n", "4 1 1.0 1 0.08 1 7 0 \n", "\n", " P_HIE_Adopt \n", "0 0 \n", "1 0 \n", "2 0 \n", "3 0 \n", "4 0 \n", "\n", "[5 rows x 63 columns]" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "my_data =pd.read_excel(\"essi.xlsx\",\n", " header=int(0),\n", " sheet_name = \"Sheet1\" \n", " )\n", "print(my_data.shape)\n", "my_data.head()" ] }, { "cell_type": "markdown", "id": "cf280d83-a24c-4458-ab72-66a115e7cf99", "metadata": {}, "source": [ "Check the data types of the columns. For categorical data, make sure the type is integer or objects. Make sure the categories do not contain special characters besides numbers and \"_\"." 
] }, { "cell_type": "code", "execution_count": 5, "id": "17f4debf", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "RangeIndex: 25808 entries, 0 to 25807\n", "Data columns (total 63 columns):\n", " # Column Non-Null Count Dtype \n", "--- ------ -------------- ----- \n", " 0 Year 25808 non-null int64 \n", " 1 DSA 25808 non-null object \n", " 2 Program_Year.x 25808 non-null int64 \n", " 3 Facility_Count 25808 non-null int64 \n", " 4 HIE_B_Count 25808 non-null int64 \n", " 5 Weighted_HIE_DSA 25808 non-null float64\n", " 6 HIE_DSA_Penetration 25808 non-null int64 \n", " 7 Time 25808 non-null int64 \n", " 8 ZIP 25808 non-null int64 \n", " 9 County.Code 25808 non-null int64 \n", " 10 State.Name 25808 non-null object \n", " 11 County.Name 25808 non-null object \n", " 12 County.FIPS 25808 non-null int64 \n", " 13 Provider_Number 25808 non-null int64 \n", " 14 Network 25808 non-null int64 \n", " 15 Facility_Name 25808 non-null object \n", " 16 Unnamed: 16 0 non-null float64\n", " 17 City 25808 non-null object \n", " 18 STATE 25808 non-null object \n", " 19 Late.Shift 25808 non-null object \n", " 20 Chain.Owned 23214 non-null object \n", " 21 Chain.Organization 25808 non-null object \n", " 22 TOTSTAS 25808 non-null int64 \n", " 23 HD 25808 non-null int64 \n", " 24 PD 25808 non-null int64 \n", " 25 HOMEHD 25808 non-null int64 \n", " 26 N_DP_HGBD 25454 non-null float64\n", " 27 N_DP_HGBD12 23537 non-null float64\n", " 28 PTSURV_C 25808 non-null int64 \n", " 29 County 25808 non-null object \n", " 30 HDKTVPM12_F 19151 non-null float64\n", " 31 VAVF_F 23931 non-null float64\n", " 32 F_Star 0 non-null float64\n", " 33 PT_HOS_T 25798 non-null object \n", " 34 PT_HOS_T1 25808 non-null int64 \n", " 35 PT_HOS_T2 24936 non-null float64\n", " 36 PT_HOS_C 25808 non-null int64 \n", " 37 N_PT_HOS_S 25418 non-null float64\n", " 38 N_PTSURV_SUM 25416 non-null float64\n", " 39 SHR 25808 non-null float64\n", " 40 SMR 25808 non-null float64\n", " 41 
PT_TRANS_T 25061 non-null object \n", " 42 PT_TRANS_T1 23396 non-null float64\n", " 43 PT_TRANS_T2 23396 non-null float64\n", " 44 N_PT_TRANS_S 25269 non-null float64\n", " 45 PTSURV_T 25808 non-null object \n", " 46 PTSURV_T1 25808 non-null int64 \n", " 47 PTSURV_T2 24872 non-null float64\n", " 48 R_Year 25808 non-null int64 \n", " 49 HalfYear.x 25808 non-null int64 \n", " 50 N_Tx_Ctr1 25808 non-null int64 \n", " 51 N_R_Ctr1 25808 non-null int64 \n", " 52 N_Center 25808 non-null int64 \n", " 53 Tx_R_O1 25808 non-null float64\n", " 54 Tx_R_ST1 25808 non-null int64 \n", " 55 Tx_R_ST2 25808 non-null int64 \n", " 56 Flagging_ratio 25808 non-null float64\n", " 57 Tx_R_ST_C 25808 non-null int64 \n", " 58 M_R_O1 25808 non-null float64\n", " 59 M_R_ST1 25808 non-null int64 \n", " 60 HalfYear.y 25808 non-null int64 \n", " 61 HIE_DSA_P 25808 non-null int64 \n", " 62 P_HIE_Adopt 25808 non-null int64 \n", "dtypes: float64(19), int64(31), object(13)\n", "memory usage: 12.4+ MB\n" ] } ], "source": [ "my_data.info()" ] }, { "cell_type": "markdown", "id": "622c9753-c59a-4cb8-bfd4-bb03f9c5434b", "metadata": {}, "source": [ "Pandas thinks PTSURV_T and PT_HOS_T are integers/numeric, but they are categorical. Recast them as objects." ] }, { "cell_type": "code", "execution_count": 6, "id": "bc565d07", "metadata": {}, "outputs": [], "source": [ "#\"PTSURV_T\", \"PT_HOS_T\"\n", "my_data = my_data.astype({\"PTSURV_T\": object, \"PT_HOS_T\": object})\n" ] }, { "cell_type": "code", "execution_count": 7, "id": "77108245", "metadata": {}, "outputs": [], "source": [ "my_data.dropna(subset=[\"PTSURV_T\"], inplace=True)\n", "#my_data.dropna().reset_index(drop=True)" ] }, { "cell_type": "markdown", "id": "b5b3c159-cac7-4599-8680-8c39c381da77", "metadata": {}, "source": [ "Identify invariant and categorical columns and the target column. The target should be in one of the lists." 
] }, { "cell_type": "code", "execution_count": 8, "id": "820e9888", "metadata": {}, "outputs": [], "source": [
  "#invariant_names=[\"Tx_R_O1\", \"N_Tx_Ctr1\", \"N_R_Ctr1\", \"N_Center\", \"M_R_O1\", \"M_R_ST1\", \"N_PT_TRANS_S\", \"PT_TRANS_T1\", \"N_PTSURV_SUM\", \"N_PT_HOS_S\", \"VAVF_F\", \"N_DP_HGBD12\", \"N_DP_HGBD\", \"HOMEHD\", \"PD\", \"HD\", \"TOTSTAS\"]\n",
  "invariant_names=[\"Weighted_HIE_DSA\", \"Flagging_ratio\", \"Tx_R_O1\", \"N_Tx_Ctr1\"]\n",
  "\n",
  "categorical_names=[\"PTSURV_T\", \"PT_HOS_T\"]\n",
  "target = \"PTSURV_T\""
] }, { "cell_type": "markdown", "id": "88b339af-22c1-415b-8faa-fbfaa3437f71", "metadata": {}, "source": [ "Generic code starts here. Rename target column. Select columns." ] }, { "cell_type": "code", "execution_count": 9, "id": "5ae99a05-9075-4ce7-9bb7-bb7137c3cb44", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [
  "Index(['Year', 'DSA', 'Program_Year.x', 'Facility_Count', 'HIE_B_Count',\n",
  " 'Weighted_HIE_DSA', 'HIE_DSA_Penetration', 'Time', 'ZIP', 'County.Code',\n",
  " 'State.Name', 'County.Name', 'County.FIPS', 'Provider_Number',\n",
  " 'Network', 'Facility_Name', 'Unnamed: 16', 'City', 'STATE',\n",
  " 'Late.Shift', 'Chain.Owned', 'Chain.Organization', 'TOTSTAS', 'HD',\n",
  " 'PD', 'HOMEHD', 'N_DP_HGBD', 'N_DP_HGBD12', 'PTSURV_C', 'County',\n",
  " 'HDKTVPM12_F', 'VAVF_F', 'F_Star', 'PT_HOS_T', 'PT_HOS_T1', 'PT_HOS_T2',\n",
  " 'PT_HOS_C', 'N_PT_HOS_S', 'N_PTSURV_SUM', 'SHR', 'SMR', 'PT_TRANS_T',\n",
  " 'PT_TRANS_T1', 'PT_TRANS_T2', 'N_PT_TRANS_S', 'TARGET', 'PTSURV_T1',\n",
  " 'PTSURV_T2', 'R_Year', 'HalfYear.x', 'N_Tx_Ctr1', 'N_R_Ctr1',\n",
  " 'N_Center', 'Tx_R_O1', 'Tx_R_ST1', 'Tx_R_ST2', 'Flagging_ratio',\n",
  " 'Tx_R_ST_C', 'M_R_O1', 'M_R_ST1', 'HalfYear.y', 'HIE_DSA_P',\n",
  " 'P_HIE_Adopt'],\n",
  " dtype='object')\n",
  "['TARGET', 'PT_HOS_T']\n"
] } ], "source": [
  "# Replace the target's name with \"TARGET\" in whichever list holds it.\n",
  "# The last branch makes a re-run of this cell (after the rename already happened)\n",
  "# a no-op, and turns a target that is in neither list into a readable error.\n",
  "if target in categorical_names:\n",
  "    categorical_names[categorical_names.index(target)] = \"TARGET\"\n",
  "elif target in invariant_names:\n",
  "    invariant_names[invariant_names.index(target)] = \"TARGET\"\n",
  "elif \"TARGET\" not in categorical_names + invariant_names:\n",
  "    raise ValueError(\n",
  "        \"target %r must be listed in invariant_names or categorical_names\" % (target,)\n",
  "    )\n",
  "\n",
  "# rename() ignores labels that are absent, so this is also safe on a re-run.\n",
  "my_data = my_data.rename(columns={target: \"TARGET\"})\n",
  "print(my_data.columns)\n",
  "# Keep only the columns that take part in conjecturing.\n",
  "my_data = my_data[invariant_names + categorical_names]\n",
  "print(categorical_names)"
] }, { "cell_type": "markdown", "id": "94bcbfd3-514a-4a5b-bbea-9ffdd72425c1", "metadata": {}, "source": [ "Convert categorical variables to dummies. One dummy for each binary variable and one dummy for each level for variables with more than two levels." ] }, { "cell_type": "code", "execution_count": 10, "id": "daa9681d-dcc4-4600-86c2-8926b2ba2bd1", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Weighted_HIE_DSAFlagging_ratioTx_R_O1N_Tx_Ctr1TARGET_As ExpectedTARGET_Better than ExpectedTARGET_Not AvailableTARGET_Worse than ExpectedTARGET_nanPT_HOS_T_As ExpectedPT_HOS_T_Better than ExpectedPT_HOS_T_Not AvailablePT_HOS_T_Worse than ExpectedPT_HOS_T_nan
00.01.00.051571000010000
10.01.00.051571000001000
20.01.00.051570001010000
30.01.00.051571000010000
40.01.00.051571000010000
\n", "
" ], "text/plain": [ " Weighted_HIE_DSA Flagging_ratio Tx_R_O1 N_Tx_Ctr1 TARGET_As Expected \\\n", "0 0.0 1.0 0.05 157 1 \n", "1 0.0 1.0 0.05 157 1 \n", "2 0.0 1.0 0.05 157 0 \n", "3 0.0 1.0 0.05 157 1 \n", "4 0.0 1.0 0.05 157 1 \n", "\n", " TARGET_Better than Expected TARGET_Not Available \\\n", "0 0 0 \n", "1 0 0 \n", "2 0 0 \n", "3 0 0 \n", "4 0 0 \n", "\n", " TARGET_Worse than Expected TARGET_nan PT_HOS_T_As Expected \\\n", "0 0 0 1 \n", "1 0 0 0 \n", "2 1 0 1 \n", "3 0 0 1 \n", "4 0 0 1 \n", "\n", " PT_HOS_T_Better than Expected PT_HOS_T_Not Available \\\n", "0 0 0 \n", "1 1 0 \n", "2 0 0 \n", "3 0 0 \n", "4 0 0 \n", "\n", " PT_HOS_T_Worse than Expected PT_HOS_T_nan \n", "0 0 0 \n", "1 0 0 \n", "2 0 0 \n", "3 0 0 \n", "4 0 0 " ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "property_names = []\n", "for col in categorical_names:\n", " if col != \"TARGET\":\n", " unique_vals=list(my_data[col].unique()) # if nan is a level\n", " #unique_vals=list(my_data[col].dropna().unique()) # if nan is not a level\n", " if len(unique_vals)==2: # just use one level for binary features\n", " property_names.append(col+\"_\"+str(unique_vals[1]))\n", " elif len(unique_vals) > 2: #one property for each level.\n", " for level in unique_vals:\n", " property_names.append(col+\"_\"+str(level))\n", "\n", "\n", "if \"TARGET\" in categorical_names:\n", " target_property_names = []\n", " unique_vals = list(my_data[\"TARGET\"].unique()) # if nan is a level\n", " #unique_vals = list(my_data[\"TARGET\"].dropna().unique()) # if nan is not a level\n", " if len(unique_vals)==2:\n", " target_property_names.append(\"TARGET_\"+str(unique_vals[1]))\n", " elif len(unique_vals) > 2:\n", " for level in unique_vals:\n", " target_property_names.append(\"TARGET_\"+str(level))\n", " \n", "my_df = pd.get_dummies(my_data, \n", " columns=categorical_names,\n", " dtype=np.uint8,\n", " dummy_na=True, # False is the default. 
If False, use dropna() above\n",
  "                       drop_first=False) # False is the default\n",
  "\n",
  "# Strip only a trailing \".0\" (levels that were read as floats). Replacing every\n",
  "# \".0\" would also mangle fractional levels such as \"0.05\" -> \"05\".\n",
  "# NOTE(review): property_names above are built with str(level) and are not\n",
  "# normalised the same way - confirm build_prop can find float-valued levels.\n",
  "my_df = my_df.rename(lambda col: col[:-2] if col.endswith('.0') else col, axis='columns')\n",
  "my_df.head()"
] }, { "cell_type": "markdown", "id": "2c9c7a94-00ee-49df-8b80-90ece844bee9", "metadata": {}, "source": [ "Define class, invariants, properties, and target properties (if applicable)." ] }, { "cell_type": "code", "execution_count": 11, "id": "5aaf4118", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "['PT_HOS_T_As Expected', 'PT_HOS_T_Better than Expected', 'PT_HOS_T_Worse than Expected', 'PT_HOS_T_Not Available', 'PT_HOS_T_nan']\n" ] } ], "source": [
  "class Example():\n",
  "    \"\"\"Holds a row identifier (`name`) and the data frame (`mydf`) it refers to.\"\"\"\n",
  "    def __init__(self, name, mydf):\n",
  "        self.name = name\n",
  "        self.mydf = mydf\n",
  "\n",
  "# Attach the functions returned by build_inv / build_prop (loaded from\n",
  "# conjecturing.py) to Example, each under the __name__ the builder gave it.\n",
  "for i in invariant_names:\n",
  "    inv = build_inv(i)\n",
  "    setattr(Example, inv.__name__, inv)\n",
  "\n",
  "for i in property_names:\n",
  "    prop = build_prop(i)\n",
  "    setattr(Example, prop.__name__, prop)\n",
  "\n",
  "if \"TARGET\" in categorical_names:\n",
  "    for i in target_property_names:\n",
  "        prop = build_prop(i)\n",
  "        setattr(Example, prop.__name__, prop)\n",
  "else:\n",
  "    # Numeric target: remember its position in the invariant list instead.\n",
  "    target_invariant = invariant_names.index(\"TARGET\")\n",
  "print(property_names)"
] }, { "cell_type": "markdown", "id": "1a474cff-5048-45c0-8cdb-f28ae59e6215", "metadata": {}, "source": [ "Split into training and testing data."
] }, { "cell_type": "code", "execution_count": 12, "id": "a84753d5-f6ff-44a2-ae83-e084938808ab", "metadata": {}, "outputs": [], "source": [
  "# Stratify on the target levels only when the target is categorical;\n",
  "# stratify=None is train_test_split's default, i.e. a plain random split.\n",
  "strata = my_data[\"TARGET\"] if \"TARGET\" in categorical_names else None\n",
  "X_train, X_test = train_test_split(\n",
  "    my_df.index,\n",
  "    stratify=strata,\n",
  "    train_size=num_train,\n",
  "    random_state=12345\n",
  ")"
] }, { "cell_type": "markdown", "id": "41f925f4-f870-46bc-9778-7935914574a7", "metadata": {}, "source": [ "Create examples for conjecturing." ] }, { "cell_type": "code", "execution_count": 13, "id": "fee01df8", "metadata": {}, "outputs": [], "source": [
  "# One Example per index label on each side of the split.\n",
  "train_examples = [Example(label, my_df) for label in X_train]\n",
  "test_examples = [Example(label, my_df) for label in X_test]"
] }, { "cell_type": "markdown", "id": "62bfa521-f831-45f2-86b2-9cd69f2cc4e8", "metadata": {}, "source": [ "Get lists of invariant and property functions." ] }, { "cell_type": "code", "execution_count": 14, "id": "3f252376", "metadata": {}, "outputs": [], "source": [
  "# Look the functions up on the class by the names they were attached under.\n",
  "invariants = [Example.__dict__[name] for name in invariant_names]\n",
  "properties = [Example.__dict__[name] for name in property_names]\n",
  "if \"TARGET\" in categorical_names:\n",
  "    target_properties = [Example.__dict__[name] for name in target_property_names]\n",
  "else:\n",
  "    target_properties = []\n"
] }, { "cell_type": "code", "execution_count": 15, "id": "0367a44b-736d-4388-8e63-8307607f1f8b", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Weighted_HIE_DSAFlagging_ratioTx_R_O1N_Tx_Ctr1TARGET_As ExpectedTARGET_Better than ExpectedTARGET_Not AvailableTARGET_Worse than ExpectedTARGET_nanPT_HOS_T_As ExpectedPT_HOS_T_Better than ExpectedPT_HOS_T_Not AvailablePT_HOS_T_Worse than ExpectedPT_HOS_T_nan
00.01.00.0500001571000010000
10.01.00.0500001571000001000
20.01.00.0500001570001010000
30.01.00.0500001571000010000
40.01.00.0500001571000010000
.............................................
258030.01.00.0895061180100010000
258040.01.00.0895061181000010000
258050.01.00.0895061181000010000
258060.01.00.0895061181000010000
258070.01.00.0895061181000010000
\n", "

25808 rows × 14 columns

\n", "
" ], "text/plain": [ " Weighted_HIE_DSA Flagging_ratio Tx_R_O1 N_Tx_Ctr1 \\\n", "0 0.0 1.0 0.050000 157 \n", "1 0.0 1.0 0.050000 157 \n", "2 0.0 1.0 0.050000 157 \n", "3 0.0 1.0 0.050000 157 \n", "4 0.0 1.0 0.050000 157 \n", "... ... ... ... ... \n", "25803 0.0 1.0 0.089506 118 \n", "25804 0.0 1.0 0.089506 118 \n", "25805 0.0 1.0 0.089506 118 \n", "25806 0.0 1.0 0.089506 118 \n", "25807 0.0 1.0 0.089506 118 \n", "\n", " TARGET_As Expected TARGET_Better than Expected TARGET_Not Available \\\n", "0 1 0 0 \n", "1 1 0 0 \n", "2 0 0 0 \n", "3 1 0 0 \n", "4 1 0 0 \n", "... ... ... ... \n", "25803 0 1 0 \n", "25804 1 0 0 \n", "25805 1 0 0 \n", "25806 1 0 0 \n", "25807 1 0 0 \n", "\n", " TARGET_Worse than Expected TARGET_nan PT_HOS_T_As Expected \\\n", "0 0 0 1 \n", "1 0 0 0 \n", "2 1 0 1 \n", "3 0 0 1 \n", "4 0 0 1 \n", "... ... ... ... \n", "25803 0 0 1 \n", "25804 0 0 1 \n", "25805 0 0 1 \n", "25806 0 0 1 \n", "25807 0 0 1 \n", "\n", " PT_HOS_T_Better than Expected PT_HOS_T_Not Available \\\n", "0 0 0 \n", "1 1 0 \n", "2 0 0 \n", "3 0 0 \n", "4 0 0 \n", "... ... ... \n", "25803 0 0 \n", "25804 0 0 \n", "25805 0 0 \n", "25806 0 0 \n", "25807 0 0 \n", "\n", " PT_HOS_T_Worse than Expected PT_HOS_T_nan \n", "0 0 0 \n", "1 0 0 \n", "2 0 0 \n", "3 0 0 \n", "4 0 0 \n", "... ... ... \n", "25803 0 0 \n", "25804 0 0 \n", "25805 0 0 \n", "25806 0 0 \n", "25807 0 0 \n", "\n", "[25808 rows x 14 columns]" ] }, "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ "my_df" ] }, { "cell_type": "markdown", "id": "5b513bc3-3f57-4c7b-8825-c3a9d19527b9", "metadata": {}, "source": [ "Invariant conjecturing - upper and lower bounds." 
] }, { "cell_type": "code", "execution_count": 16, "id": "24ae1792", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "TARGET_As Expected\n", "TARGET_Worse than Expected\n", "TARGET_Better than Expected\n", "TARGET_Not Available\n", "346\n", "346\n" ] } ], "source": [
  "# Operators the expression trees may use when building upper and lower bounds for each class.\n",
  "use_operators = { '-1', '+1', '*2', '/2', '^2', '-()', '1/',\n",
  "                  'sqrt', 'ln', 'log10', 'exp', '10^', 'ceil',\n",
  "                  'floor', 'abs', '+', '*', 'max', 'min', '-', '/', '^'}\n",
  "\n",
  "\n",
  "def bound_conjectures(examples, inv_index):\n",
  "    \"\"\"Conjecture upper bounds, then lower bounds, for one invariant.\n",
  "\n",
  "    examples  -- the Example objects passed to conjecture()\n",
  "    inv_index -- position in `invariants` of the invariant to bound\n",
  "\n",
  "    Returns the upper-bound conjectures followed by the lower-bound ones,\n",
  "    each batch having been passed through convert_conjecture_names().\n",
  "    \"\"\"\n",
  "    found = []\n",
  "    for upper in (True, False):\n",
  "        conjs = conjecture(examples,\n",
  "                           invariants,\n",
  "                           inv_index,\n",
  "                           operators=use_operators,\n",
  "                           upperBound=upper,\n",
  "                           time=Integer(5),\n",
  "                           # debug=True,\n",
  "                           # verbose=True,\n",
  "                           skips=my_skips)\n",
  "        convert_conjecture_names(conjs)\n",
  "        found += conjs\n",
  "    return found\n",
  "\n",
  "\n",
  "def class_bound_conjectures(examples):\n",
  "    \"\"\"Bound every invariant in `invariants` over `examples`, in list order.\"\"\"\n",
  "    found = []\n",
  "    for inv_index in range(len(invariants)):\n",
  "        sys.stdout.flush()  # push pending prints out before the next (slow) search\n",
  "        found += bound_conjectures(examples, inv_index)\n",
  "    return found\n",
  "\n",
  "\n",
  "inv_conjectures = []\n",
  "\n",
  "if \"TARGET\" in categorical_names:\n",
  "    # One pass per target level, using only the training examples of that level.\n",
  "    for value in target_property_names:\n",
  "        print(value)\n",
  "        target_property = Example.__dict__[value]\n",
  "        my_examples = [example for example in train_examples if target_property(example)]\n",
  "        inv_conjectures += class_bound_conjectures(my_examples)\n",
  "    print(len(inv_conjectures))\n",
  "    if len(target_property_names) == 1:\n",
  "        # Binary target: only one dummy exists, so bound the negative class separately.\n",
  "        value = target_property_names[0]\n",
  "        print(value + \" False\")\n",
  "        target_property = Example.__dict__[value]\n",
  "        my_examples = [example for example in train_examples if not target_property(example)]\n",
  "        inv_conjectures += class_bound_conjectures(my_examples)\n",
  "else: # target is an invariant\n",
  "    my_examples = list(train_examples)\n",
  "    inv_conjectures += bound_conjectures(my_examples, target_invariant)\n",
  "print(len(inv_conjectures))\n",
  "\n",
  "# Close the handle opened at the top of the notebook and reopen the same path, so\n",
  "# this cell can be re-run; writing to the already-closed handle raised ValueError.\n",
  "inv_file.close()\n",
  "with open(inv_file.name, \"w\") as inv_out:\n",
  "    for c in inv_conjectures:\n",
  "        inv_out.write(\"%s\\n\" % c)\n"
] }, { "cell_type": "markdown", "id": "2fc6d1d3-75e2-43e5-ad29-5e1669954216", "metadata": {}, "source": [ "Property conjecturing - sufficient conditions for categorical target values. For a binary target, get sufficient conditions for the positive class and necessary conditions for the negative class."
] }, { "cell_type": "code", "execution_count": 17, "id": "9c8befb8", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "TARGET_As Expected\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "/lustre/home/clarson/anaconda3/envs/sage/lib/python3.11/site-packages/sage/misc/functional.py:1209: RuntimeWarning: divide by zero encountered in log\n", " return ln(args[0], **kwds)\n", "/lustre/home/clarson/conjecturing/sage/conjecturing.py:279: RuntimeWarning: overflow encountered in scalar power\n", " return (lambda x: 10**x), 1\n", "/lustre/home/clarson/conjecturing/sage/conjecturing.py:177: RuntimeWarning: invalid value encountered in scalar multiply\n", " stack.append(op(left, right))\n", "/lustre/home/clarson/conjecturing/sage/conjecturing.py:132: RuntimeWarning: overflow encountered in exp\n", " stack.append(op(stack.pop()))\n", "/lustre/home/clarson/conjecturing/sage/conjecturing.py:177: RuntimeWarning: overflow encountered in scalar power\n", " stack.append(op(left, right))\n", "/lustre/home/clarson/anaconda3/envs/sage/lib/python3.11/site-packages/sage/misc/functional.py:1209: RuntimeWarning: invalid value encountered in log\n", " return ln(args[0], **kwds)\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "(~(Weighted_HIE_DSA_leq_open_bracket_Flagging_ratio_plus_Tx_R_O1_close_bracket_divided_by_open_bracket_logopen_bracket_N_Tx_Ctr1_close_bracket_minus_1_close_bracket))->(TARGET_AsExpected)\n", "(~(N_Tx_Ctr1_leq_inverse_of_2_times_e_to_the_power_open_bracket_2_times_e_to_the_power_open_bracket_e_to_the_power_Weighted_HIE_DSA_close_bracket_plus_1_close_bracket))->(TARGET_AsExpected)\n", "(~(Tx_R_O1_leq_inverse_of_sqrtopen_bracket_2_times_logopen_bracket_N_Tx_Ctr1_close_bracket_plus_1_close_bracket))->(TARGET_AsExpected)\n", 
"(~(N_Tx_Ctr1_geq_e_to_the_power_open_bracket_e_to_the_power_open_bracket_e_to_the_power_open_bracket_Weighted_HIE_DSA_to_the_power_open_bracket_inverse_of_4_close_bracket_close_bracket_close_bracket_close_bracket))->(TARGET_AsExpected)\n", "((~(Weighted_HIE_DSA_geq_open_bracket_inverse_of_open_bracket_10_to_the_power_open_bracket_inverse_of_2_divided_by_Tx_R_O1_close_bracket_minus_N_Tx_Ctr1_close_bracket_close_bracket))&(Tx_R_O1_geq_open_bracket_inverse_of_logopen_bracket_10_to_the_power_open_bracket_10_to_the_power_Flagging_ratio_close_bracket_plus_N_Tx_Ctr1_close_bracket_close_bracket))->(TARGET_AsExpected)\n", "((Tx_R_O1_geq_open_bracket_logopen_bracket_flooropen_bracket_logopen_bracket_N_Tx_Ctr1_close_bracket_close_bracket_close_bracket_divided_by_logopen_bracket_10_close_bracket_minus_1_close_bracket_squared)^(Weighted_HIE_DSA_leq__minus_inverse_of_open_bracket_sqrtopen_bracket_N_Tx_Ctr1_close_bracket_plus_1_close_bracket_plus_Tx_R_O1))->(TARGET_AsExpected)\n", "((N_Tx_Ctr1_geq_ceilopen_bracket_10_to_the_power_open_bracket_2_times_sqrtopen_bracket_Tx_R_O1_close_bracket_plus_1_close_bracket_close_bracket)^(N_Tx_Ctr1_geq_logopen_bracket_10_to_the_power_open_bracket_10_to_the_power_open_bracket_10_to_the_power_Tx_R_O1_close_bracket_minus_1_close_bracket_close_bracket))->(TARGET_AsExpected)\n", "((N_Tx_Ctr1_geq_open_bracket_10_to_the_power_Tx_R_O1_close_bracket_to_the_power_open_bracket_10_to_the_power_open_bracket_Weighted_HIE_DSA_plus_1_close_bracket_minus_1_close_bracket)^(N_Tx_Ctr1_leq_ceilopen_bracket_e_to_the_power_open_bracket_inverse_of_4_divided_by_Tx_R_O1_squared_close_bracket_close_bracket))->(TARGET_AsExpected)\n", "((Tx_R_O1_geq_10_to_the_power_open_bracket_10_to_the_power_open_bracket_Weighted_HIE_DSA_minus_1_close_bracket_minus_1_close_bracket_minus_Flagging_ratio)^(Weighted_HIE_DSA_leq__minus_inverse_of_open_bracket_sqrtopen_bracket_N_Tx_Ctr1_close_bracket_plus_1_close_bracket_plus_Tx_R_O1))->(TARGET_AsExpected)\n", 
"((Tx_R_O1_leq_ceilopen_bracket_e_to_the_power_Weighted_HIE_DSA_close_bracket_divided_by_open_bracket_logopen_bracket_N_Tx_Ctr1_close_bracket_plus_1_close_bracket)^(Weighted_HIE_DSA_geq__minus_inverse_of_logopen_bracket_inverse_of_2_times_N_Tx_Ctr1_plus_inverse_of_2_close_bracket_plus_Tx_R_O1))->(TARGET_AsExpected)\n", "((Tx_R_O1_leq_10_to_the_power_open_bracket_Weighted_HIE_DSA_divided_by_10_to_the_power_logopen_bracket_Flagging_ratio_close_bracket_minus_1_close_bracket)^(Tx_R_O1_leq_open_bracket_logopen_bracket_2_times_sqrtopen_bracket_Weighted_HIE_DSA_close_bracket_close_bracket_divided_by_logopen_bracket_10_close_bracket_plus_1_close_bracket_squared))->(TARGET_AsExpected)\n", "(~((Tx_R_O1_geq_open_bracket_inverse_of_open_bracket_10_to_the_power_e_to_the_power_Flagging_ratio_minus_logopen_bracket_N_Tx_Ctr1_close_bracket_close_bracket_close_bracket)|(Weighted_HIE_DSA_geq_open_bracket_inverse_of_2_times_Tx_R_O1_minus_1_close_bracket_to_the_power_ceilopen_bracket_inverse_of_2_times_N_Tx_Ctr1_close_bracket)))->(TARGET_AsExpected)\n", "TARGET_Worse than Expected\n", "(~((Tx_R_O1_geq_open_bracket_inverse_of_open_bracket_10_to_the_power_e_to_the_power_Flagging_ratio_minus_logopen_bracket_N_Tx_Ctr1_close_bracket_close_bracket_close_bracket)|(N_Tx_Ctr1_geq_logopen_bracket_10_to_the_power_open_bracket_10_to_the_power_open_bracket_10_to_the_power_Tx_R_O1_close_bracket_minus_1_close_bracket_close_bracket)))->(TARGET_WorsethanExpected)\n", "((Flagging_ratio_geq__minus_open_bracket_Tx_R_O1_times_Weighted_HIE_DSA_minus_1_close_bracket_to_the_power_N_Tx_Ctr1)&(PT_HOS_T_WorsethanExpected))->(TARGET_WorsethanExpected)\n", "TARGET_Better than Expected\n", "(~(N_Tx_Ctr1_geq__minus_inverse_of_open_bracket_Tx_R_O1_minus_4_times_Weighted_HIE_DSA_close_bracket))->(TARGET_BetterthanExpected)\n", "(~(Tx_R_O1_geq_4_times_open_bracket_Flagging_ratio_minus_1_close_bracket_squared_divided_by_N_Tx_Ctr1))->(TARGET_BetterthanExpected)\n", 
"((Weighted_HIE_DSA_geq_open_bracket_Tx_R_O1_minus_1_close_bracket_to_the_power_ceilopen_bracket_logopen_bracket_10_to_the_power_N_Tx_Ctr1_close_bracket_close_bracket)^(N_Tx_Ctr1_leq_10_to_the_power_flooropen_bracket_e_to_the_power_open_bracket_inverse_of_2_divided_by_Tx_R_O1_close_bracket_close_bracket))->(TARGET_BetterthanExpected)\n", "((~(Weighted_HIE_DSA_leq_absopen_bracket_logopen_bracket_logopen_bracket_sqrtopen_bracket_Tx_R_O1_close_bracket_close_bracket_squared_close_bracket_close_bracket))&(PT_HOS_T_WorsethanExpected))->(TARGET_BetterthanExpected)\n", "TARGET_Not Available\n", "(PT_HOS_T_NotAvailable)->(TARGET_NotAvailable)\n" ] } ], "source": [
    "# Generate property-based conjectures for every level of the target.\n",
    "# conditions[value] holds the premises of the conjectures found for that target level;\n",
    "# conditions[\"necessary\"] holds conclusions and is only filled for a single-level target.\n",
    "all_properties = [\"TARGET\"] + properties + inv_conjectures  # slot 0 is a placeholder, replaced by the current target below\n",
    "prop_conjs = []\n",
    "conditions = {}\n",
    "if \"TARGET\" in categorical_names:\n",
    "    for value in target_property_names:\n",
    "        print(value)\n",
    "        # getattr matches how the later cells look up the target property\n",
    "        all_properties[0] = getattr(Example, value)\n",
    "        these_prop_conjs = propertyBasedConjecture(objects=train_examples,\n",
    "                                                   properties=all_properties,\n",
    "                                                   mainProperty=0,\n",
    "                                                   skips=my_skips)\n",
    "        # extract each premise once; the later cells reuse these callables\n",
    "        conditions[value] = [get_premise(c, myprint=False) for c in these_prop_conjs]\n",
    "        prop_conjs += these_prop_conjs\n",
    "    if len(target_property_names) == 1:\n",
    "        # name the single target explicitly instead of relying on the leaked loop variable\n",
    "        value = target_property_names[0]\n",
    "        print(value + \" Necessary\")\n",
    "        all_properties[0] = getattr(Example, value)\n",
    "        these_prop_conjs = propertyBasedConjecture(objects=train_examples,\n",
    "                                                   properties=all_properties,\n",
    "                                                   mainProperty=0,\n",
    "                                                   sufficient=False,\n",
    "                                                   skips=my_skips)\n",
    "        conditions[\"necessary\"] = [get_conclusion(c, myprint=False) for c in these_prop_conjs]\n",
    "        prop_conjs += these_prop_conjs\n",
    "\n",
    "# Write one conjecture per line; close the file even if a write fails.\n",
    "try:\n",
    "    for c in prop_conjs:\n",
    "        prop_file.write(\"%s\\n\" % convert_name_back(c.__name__))\n",
    "    prop_file.flush()\n",
    "finally:\n",
    "    prop_file.close()"
] }, { "cell_type": "code", "execution_count": 18, "id": "50788eb0-61a3-4aad-87f0-3ae3508bb38b", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "1" ] }, "execution_count": 18, "metadata": {}, "output_type": "execute_result" } ], "source": [ "len(these_prop_conjs)" ] }, { "cell_type": "markdown", "id": "9e3b80c5-fa02-4d58-bfe2-b44ff0b81666", "metadata": {}, "source": [ "Apply conjectures to train and test data if target is categorical." ] }, { "cell_type": "code", "execution_count": 19, "id": "f3ab0127-5bef-46ca-9f52-f0a27f35e55a", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/lustre/home/clarson/conjecturing/sage/conjecturing.py:279: RuntimeWarning: overflow encountered in scalar power\n", " return (lambda x: 10**x), 1\n", "/lustre/home/clarson/conjecturing/sage/conjecturing.py:279: RuntimeWarning: overflow encountered in scalar power\n", " return (lambda x: 10**x), 1\n", "/lustre/home/clarson/anaconda3/envs/sage/lib/python3.11/site-packages/sage/misc/functional.py:1209: RuntimeWarning: divide by zero encountered in log\n", " return ln(args[0], **kwds)\n", "/lustre/home/clarson/anaconda3/envs/sage/lib/python3.11/site-packages/sage/misc/functional.py:1209: RuntimeWarning: divide by zero encountered in log\n", " return ln(args[0], **kwds)\n", "/lustre/home/clarson/conjecturing/sage/conjecturing.py:279: RuntimeWarning: overflow encountered in scalar power\n", " return (lambda x: 10**x), 1\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "{'TARGET_As Expected': [~Weighted_HIE_DSA_leq_open_bracket_Flagging_ratio_plus_Tx_R_O1_close_bracket_divided_by_open_bracket_logopen_bracket_N_Tx_Ctr1_close_bracket_minus_1_close_bracket, 
~N_Tx_Ctr1_leq_inverse_of_2_times_e_to_the_power_open_bracket_2_times_e_to_the_power_open_bracket_e_to_the_power_Weighted_HIE_DSA_close_bracket_plus_1_close_bracket, ~Tx_R_O1_leq_inverse_of_sqrtopen_bracket_2_times_logopen_bracket_N_Tx_Ctr1_close_bracket_plus_1_close_bracket, ~N_Tx_Ctr1_geq_e_to_the_power_open_bracket_e_to_the_power_open_bracket_e_to_the_power_open_bracket_Weighted_HIE_DSA_to_the_power_open_bracket_inverse_of_4_close_bracket_close_bracket_close_bracket_close_bracket, ~Weighted_HIE_DSA_geq_open_bracket_inverse_of_open_bracket_10_to_the_power_open_bracket_inverse_of_2_divided_by_Tx_R_O1_close_bracket_minus_N_Tx_Ctr1_close_bracket_close_bracket&Tx_R_O1_geq_open_bracket_inverse_of_logopen_bracket_10_to_the_power_open_bracket_10_to_the_power_Flagging_ratio_close_bracket_plus_N_Tx_Ctr1_close_bracket_close_bracket, Tx_R_O1_geq_open_bracket_logopen_bracket_flooropen_bracket_logopen_bracket_N_Tx_Ctr1_close_bracket_close_bracket_close_bracket_divided_by_logopen_bracket_10_close_bracket_minus_1_close_bracket_squared^Weighted_HIE_DSA_leq__minus_inverse_of_open_bracket_sqrtopen_bracket_N_Tx_Ctr1_close_bracket_plus_1_close_bracket_plus_Tx_R_O1, N_Tx_Ctr1_geq_ceilopen_bracket_10_to_the_power_open_bracket_2_times_sqrtopen_bracket_Tx_R_O1_close_bracket_plus_1_close_bracket_close_bracket^N_Tx_Ctr1_geq_logopen_bracket_10_to_the_power_open_bracket_10_to_the_power_open_bracket_10_to_the_power_Tx_R_O1_close_bracket_minus_1_close_bracket_close_bracket, N_Tx_Ctr1_geq_open_bracket_10_to_the_power_Tx_R_O1_close_bracket_to_the_power_open_bracket_10_to_the_power_open_bracket_Weighted_HIE_DSA_plus_1_close_bracket_minus_1_close_bracket^N_Tx_Ctr1_leq_ceilopen_bracket_e_to_the_power_open_bracket_inverse_of_4_divided_by_Tx_R_O1_squared_close_bracket_close_bracket, 
Tx_R_O1_geq_10_to_the_power_open_bracket_10_to_the_power_open_bracket_Weighted_HIE_DSA_minus_1_close_bracket_minus_1_close_bracket_minus_Flagging_ratio^Weighted_HIE_DSA_leq__minus_inverse_of_open_bracket_sqrtopen_bracket_N_Tx_Ctr1_close_bracket_plus_1_close_bracket_plus_Tx_R_O1, Tx_R_O1_leq_ceilopen_bracket_e_to_the_power_Weighted_HIE_DSA_close_bracket_divided_by_open_bracket_logopen_bracket_N_Tx_Ctr1_close_bracket_plus_1_close_bracket^Weighted_HIE_DSA_geq__minus_inverse_of_logopen_bracket_inverse_of_2_times_N_Tx_Ctr1_plus_inverse_of_2_close_bracket_plus_Tx_R_O1, Tx_R_O1_leq_10_to_the_power_open_bracket_Weighted_HIE_DSA_divided_by_10_to_the_power_logopen_bracket_Flagging_ratio_close_bracket_minus_1_close_bracket^Tx_R_O1_leq_open_bracket_logopen_bracket_2_times_sqrtopen_bracket_Weighted_HIE_DSA_close_bracket_close_bracket_divided_by_logopen_bracket_10_close_bracket_plus_1_close_bracket_squared, ~(Tx_R_O1_geq_open_bracket_inverse_of_open_bracket_10_to_the_power_e_to_the_power_Flagging_ratio_minus_logopen_bracket_N_Tx_Ctr1_close_bracket_close_bracket_close_bracket|Weighted_HIE_DSA_geq_open_bracket_inverse_of_2_times_Tx_R_O1_minus_1_close_bracket_to_the_power_ceilopen_bracket_inverse_of_2_times_N_Tx_Ctr1_close_bracket)], 'TARGET_Worse than Expected': [~(Tx_R_O1_geq_open_bracket_inverse_of_open_bracket_10_to_the_power_e_to_the_power_Flagging_ratio_minus_logopen_bracket_N_Tx_Ctr1_close_bracket_close_bracket_close_bracket|N_Tx_Ctr1_geq_logopen_bracket_10_to_the_power_open_bracket_10_to_the_power_open_bracket_10_to_the_power_Tx_R_O1_close_bracket_minus_1_close_bracket_close_bracket), Flagging_ratio_geq__minus_open_bracket_Tx_R_O1_times_Weighted_HIE_DSA_minus_1_close_bracket_to_the_power_N_Tx_Ctr1&PT_HOS_T_WorsethanExpected], 'TARGET_Better than Expected': [~N_Tx_Ctr1_geq__minus_inverse_of_open_bracket_Tx_R_O1_minus_4_times_Weighted_HIE_DSA_close_bracket, ~Tx_R_O1_geq_4_times_open_bracket_Flagging_ratio_minus_1_close_bracket_squared_divided_by_N_Tx_Ctr1, 
Weighted_HIE_DSA_geq_open_bracket_Tx_R_O1_minus_1_close_bracket_to_the_power_ceilopen_bracket_logopen_bracket_10_to_the_power_N_Tx_Ctr1_close_bracket_close_bracket^N_Tx_Ctr1_leq_10_to_the_power_flooropen_bracket_e_to_the_power_open_bracket_inverse_of_2_divided_by_Tx_R_O1_close_bracket_close_bracket, ~Weighted_HIE_DSA_leq_absopen_bracket_logopen_bracket_logopen_bracket_sqrtopen_bracket_Tx_R_O1_close_bracket_close_bracket_squared_close_bracket_close_bracket&PT_HOS_T_WorsethanExpected], 'TARGET_Not Available': [PT_HOS_T_NotAvailable]}\n" ] } ], "source": [
    "# Build the train/test feature frames and add one boolean column per conjecture.\n",
    "feature_names = property_names + invariant_names\n",
    "X_train_df = my_df.loc[X_train, feature_names].copy()  # drop target and one level for each binary variable\n",
    "X_test_df = my_df.loc[X_test, feature_names].copy()\n",
    "y_train_df = my_data.loc[X_train, \"TARGET\"]  # get original target, even if it is multiple levels\n",
    "y_test_df = my_data.loc[X_test, \"TARGET\"]\n",
    "\n",
    "if \"TARGET\" in categorical_names:\n",
    "    # Flatten the conditions in the same order the metrics cells iterate over them,\n",
    "    # so column conj_k lines up with row k of results_df.\n",
    "    all_conditions = [condition for value in target_property_names for condition in conditions[value]]\n",
    "    if len(target_property_names) == 1:\n",
    "        all_conditions += conditions[\"necessary\"]\n",
    "    # A single running index gives every conjecture its own column; numbering per\n",
    "    # target value would let later targets overwrite the columns of earlier ones.\n",
    "    for conj_index, condition in enumerate(all_conditions):\n",
    "        column = 'conj_' + str(conj_index)\n",
    "        X_train_df[column] = [condition(example) for example in train_examples]\n",
    "        X_test_df[column] = [condition(example) for example in test_examples]\n",
    "    print(conditions)\n",
    "\n",
    "X_train_df.head()"
] }, { "cell_type": "markdown", "id": "b410f6ff-3cff-41f6-aaab-7a60f6f6cbe3", "metadata": {}, "source": [ "Calculate support, precision, and lift." 
] }, { "cell_type": "code", "execution_count": 20, "id": "498f947a-8e7c-4492-88f7-802df56d1898", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['TARGET_As Expected',\n", " 'TARGET_Worse than Expected',\n", " 'TARGET_Better than Expected',\n", " 'TARGET_Not Available']" ] }, "execution_count": 20, "metadata": {}, "output_type": "execute_result" } ], "source": [ "target_property_names" ] }, { "cell_type": "code", "execution_count": 21, "id": "f9e29f39-e7e2-46bb-8154-ec731a18109c", "metadata": { "scrolled": true }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "value: TARGET_As Expected\n", "0 condition: ~Weighted_HIE_DSA_leq_open_bracket_Flagging_ratio_plus_Tx_R_O1_close_bracket_divided_by_open_bracket_logopen_bracket_N_Tx_Ctr1_close_bracket_minus_1_close_bracket\n", "1 condition: ~N_Tx_Ctr1_leq_inverse_of_2_times_e_to_the_power_open_bracket_2_times_e_to_the_power_open_bracket_e_to_the_power_Weighted_HIE_DSA_close_bracket_plus_1_close_bracket\n", "2 condition: ~Tx_R_O1_leq_inverse_of_sqrtopen_bracket_2_times_logopen_bracket_N_Tx_Ctr1_close_bracket_plus_1_close_bracket\n", "3 condition: ~N_Tx_Ctr1_geq_e_to_the_power_open_bracket_e_to_the_power_open_bracket_e_to_the_power_open_bracket_Weighted_HIE_DSA_to_the_power_open_bracket_inverse_of_4_close_bracket_close_bracket_close_bracket_close_bracket\n", "4 condition: ~Weighted_HIE_DSA_geq_open_bracket_inverse_of_open_bracket_10_to_the_power_open_bracket_inverse_of_2_divided_by_Tx_R_O1_close_bracket_minus_N_Tx_Ctr1_close_bracket_close_bracket&Tx_R_O1_geq_open_bracket_inverse_of_logopen_bracket_10_to_the_power_open_bracket_10_to_the_power_Flagging_ratio_close_bracket_plus_N_Tx_Ctr1_close_bracket_close_bracket\n", "5 condition: 
Tx_R_O1_geq_open_bracket_logopen_bracket_flooropen_bracket_logopen_bracket_N_Tx_Ctr1_close_bracket_close_bracket_close_bracket_divided_by_logopen_bracket_10_close_bracket_minus_1_close_bracket_squared^Weighted_HIE_DSA_leq__minus_inverse_of_open_bracket_sqrtopen_bracket_N_Tx_Ctr1_close_bracket_plus_1_close_bracket_plus_Tx_R_O1\n", "6 condition: N_Tx_Ctr1_geq_ceilopen_bracket_10_to_the_power_open_bracket_2_times_sqrtopen_bracket_Tx_R_O1_close_bracket_plus_1_close_bracket_close_bracket^N_Tx_Ctr1_geq_logopen_bracket_10_to_the_power_open_bracket_10_to_the_power_open_bracket_10_to_the_power_Tx_R_O1_close_bracket_minus_1_close_bracket_close_bracket\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "/lustre/home/clarson/conjecturing/sage/conjecturing.py:279: RuntimeWarning: overflow encountered in scalar power\n", " return (lambda x: 10**x), 1\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "7 condition: N_Tx_Ctr1_geq_open_bracket_10_to_the_power_Tx_R_O1_close_bracket_to_the_power_open_bracket_10_to_the_power_open_bracket_Weighted_HIE_DSA_plus_1_close_bracket_minus_1_close_bracket^N_Tx_Ctr1_leq_ceilopen_bracket_e_to_the_power_open_bracket_inverse_of_4_divided_by_Tx_R_O1_squared_close_bracket_close_bracket\n", "8 condition: Tx_R_O1_geq_10_to_the_power_open_bracket_10_to_the_power_open_bracket_Weighted_HIE_DSA_minus_1_close_bracket_minus_1_close_bracket_minus_Flagging_ratio^Weighted_HIE_DSA_leq__minus_inverse_of_open_bracket_sqrtopen_bracket_N_Tx_Ctr1_close_bracket_plus_1_close_bracket_plus_Tx_R_O1\n", "9 condition: Tx_R_O1_leq_ceilopen_bracket_e_to_the_power_Weighted_HIE_DSA_close_bracket_divided_by_open_bracket_logopen_bracket_N_Tx_Ctr1_close_bracket_plus_1_close_bracket^Weighted_HIE_DSA_geq__minus_inverse_of_logopen_bracket_inverse_of_2_times_N_Tx_Ctr1_plus_inverse_of_2_close_bracket_plus_Tx_R_O1\n", "10 condition: 
Tx_R_O1_leq_10_to_the_power_open_bracket_Weighted_HIE_DSA_divided_by_10_to_the_power_logopen_bracket_Flagging_ratio_close_bracket_minus_1_close_bracket^Tx_R_O1_leq_open_bracket_logopen_bracket_2_times_sqrtopen_bracket_Weighted_HIE_DSA_close_bracket_close_bracket_divided_by_logopen_bracket_10_close_bracket_plus_1_close_bracket_squared\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "/lustre/home/clarson/anaconda3/envs/sage/lib/python3.11/site-packages/sage/misc/functional.py:1209: RuntimeWarning: divide by zero encountered in log\n", " return ln(args[0], **kwds)\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "11 condition: ~(Tx_R_O1_geq_open_bracket_inverse_of_open_bracket_10_to_the_power_e_to_the_power_Flagging_ratio_minus_logopen_bracket_N_Tx_Ctr1_close_bracket_close_bracket_close_bracket|Weighted_HIE_DSA_geq_open_bracket_inverse_of_2_times_Tx_R_O1_minus_1_close_bracket_to_the_power_ceilopen_bracket_inverse_of_2_times_N_Tx_Ctr1_close_bracket)\n", "value: TARGET_Worse than Expected\n", "0 condition: ~(Tx_R_O1_geq_open_bracket_inverse_of_open_bracket_10_to_the_power_e_to_the_power_Flagging_ratio_minus_logopen_bracket_N_Tx_Ctr1_close_bracket_close_bracket_close_bracket|N_Tx_Ctr1_geq_logopen_bracket_10_to_the_power_open_bracket_10_to_the_power_open_bracket_10_to_the_power_Tx_R_O1_close_bracket_minus_1_close_bracket_close_bracket)\n", "1 condition: Flagging_ratio_geq__minus_open_bracket_Tx_R_O1_times_Weighted_HIE_DSA_minus_1_close_bracket_to_the_power_N_Tx_Ctr1&PT_HOS_T_WorsethanExpected\n", "value: TARGET_Better than Expected\n", "0 condition: ~N_Tx_Ctr1_geq__minus_inverse_of_open_bracket_Tx_R_O1_minus_4_times_Weighted_HIE_DSA_close_bracket\n", "1 condition: ~Tx_R_O1_geq_4_times_open_bracket_Flagging_ratio_minus_1_close_bracket_squared_divided_by_N_Tx_Ctr1\n", "2 condition: 
Weighted_HIE_DSA_geq_open_bracket_Tx_R_O1_minus_1_close_bracket_to_the_power_ceilopen_bracket_logopen_bracket_10_to_the_power_N_Tx_Ctr1_close_bracket_close_bracket^N_Tx_Ctr1_leq_10_to_the_power_flooropen_bracket_e_to_the_power_open_bracket_inverse_of_2_divided_by_Tx_R_O1_close_bracket_close_bracket\n", "3 condition: ~Weighted_HIE_DSA_leq_absopen_bracket_logopen_bracket_logopen_bracket_sqrtopen_bracket_Tx_R_O1_close_bracket_close_bracket_squared_close_bracket_close_bracket&PT_HOS_T_WorsethanExpected\n", "value: TARGET_Not Available\n", "0 condition: PT_HOS_T_NotAvailable\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
supportprecisionlift
029750.8517647058823531.05229309716101
146220.8308091735179581.02640406712479
224610.8529053230394151.05370224636923
326440.8131618759455371.00460211960247
423060.8026886383347790.991663199303691
535340.8053197509903790.994913746862447
620650.8445520581113801.04338239751681
717090.8262141603276771.02072726386198
841420.8242394978271371.01828771253496
950920.8303220738413201.02580229104295
1068400.8137426900584801.00531967302722
1123410.8517727466894491.05230303099103
122580.1007751937984501.33818630277404
136570.1689497716894982.24347145175290
141120.1696428571428572.14730604206232
152200.1727272727272732.18634797009982
165130.1052631578947371.33240042499158
17180.05555555555555560.703211335412222
188580.85198135198135223.5007903398461
\n", "
" ], "text/plain": [ " support precision lift\n", "0 2975 0.851764705882353 1.05229309716101\n", "1 4622 0.830809173517958 1.02640406712479\n", "2 2461 0.852905323039415 1.05370224636923\n", "3 2644 0.813161875945537 1.00460211960247\n", "4 2306 0.802688638334779 0.991663199303691\n", "5 3534 0.805319750990379 0.994913746862447\n", "6 2065 0.844552058111380 1.04338239751681\n", "7 1709 0.826214160327677 1.02072726386198\n", "8 4142 0.824239497827137 1.01828771253496\n", "9 5092 0.830322073841320 1.02580229104295\n", "10 6840 0.813742690058480 1.00531967302722\n", "11 2341 0.851772746689449 1.05230303099103\n", "12 258 0.100775193798450 1.33818630277404\n", "13 657 0.168949771689498 2.24347145175290\n", "14 112 0.169642857142857 2.14730604206232\n", "15 220 0.172727272727273 2.18634797009982\n", "16 513 0.105263157894737 1.33240042499158\n", "17 18 0.0555555555555556 0.703211335412222\n", "18 858 0.851981351981352 23.5007903398461" ] }, "execution_count": 21, "metadata": {}, "output_type": "execute_result" } ], "source": [
    "def rule_metrics(condition, target, examples, outcome=True):\n",
    "    \"\"\"Score one conjecture on examples and return (support, precision, lift).\n",
    "\n",
    "    support   -- number of examples with condition(example) == outcome\n",
    "    precision -- share of those examples that also have target(example) == outcome\n",
    "    lift      -- precision divided by the overall rate of target(example) == outcome\n",
    "\n",
    "    outcome=True scores a sufficient condition; outcome=False scores a necessary\n",
    "    condition through its contrapositive (condition False => target False).\n",
    "    Precision and lift are reported as 0.0 when there is no hit.\n",
    "    \"\"\"\n",
    "    num_match = 0\n",
    "    num_hit = 0\n",
    "    num_in_class = 0\n",
    "    for example in examples:\n",
    "        in_class = (target(example) == outcome)  # evaluate the target once per example\n",
    "        if in_class:\n",
    "            num_in_class += 1\n",
    "        if condition(example) == outcome:\n",
    "            num_match += 1\n",
    "            if in_class:\n",
    "                num_hit += 1\n",
    "    if num_hit == 0:  # also covers num_match == 0, so nothing below divides by zero\n",
    "        return num_match, 0.0, 0.0\n",
    "    rule_precision = n(num_hit/num_match)\n",
    "    return num_match, rule_precision, rule_precision/n(num_in_class/len(examples))\n",
    "\n",
    "\n",
    "support = []\n",
    "lift = []\n",
    "precision = []\n",
    "if \"TARGET\" in categorical_names:\n",
    "    for value in target_property_names:\n",
    "        print(\"value: {}\".format(value))\n",
    "        my_function = getattr(Example, value)\n",
    "        for i, condition in enumerate(conditions[value]):\n",
    "            print(i, \"condition: {}\".format(condition))\n",
    "            rule_support, rule_precision, rule_lift = rule_metrics(condition, my_function, test_examples)\n",
    "            support.append(rule_support)\n",
    "            precision.append(rule_precision)\n",
    "            lift.append(rule_lift)\n",
    "    if len(target_property_names) == 1:\n",
    "        # name the single target explicitly instead of relying on the leaked loop variable\n",
    "        my_function = getattr(Example, target_property_names[0])\n",
    "        for i, condition in enumerate(conditions[\"necessary\"]):\n",
    "            print(i, \"condition: {}\".format(condition))\n",
    "            rule_support, rule_precision, rule_lift = rule_metrics(condition, my_function, test_examples, outcome=False)\n",
    "            support.append(rule_support)\n",
    "            precision.append(rule_precision)\n",
    "            lift.append(rule_lift)\n",
    "\n",
    "results_df = pd.DataFrame({'support':support, 'precision':precision, 'lift':lift})\n",
    "\n",
    "results_df"
] }, { "cell_type": "code", "execution_count": 22, "id": "e6ebb428-48e9-4aa3-9f45-926488636708", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "value: TARGET_As Expected\n", "0 condition: ~Weighted_HIE_DSA_leq_open_bracket_Flagging_ratio_plus_Tx_R_O1_close_bracket_divided_by_open_bracket_logopen_bracket_N_Tx_Ctr1_close_bracket_minus_1_close_bracket\n", "1 condition: ~N_Tx_Ctr1_leq_inverse_of_2_times_e_to_the_power_open_bracket_2_times_e_to_the_power_open_bracket_e_to_the_power_Weighted_HIE_DSA_close_bracket_plus_1_close_bracket\n", "3 condition: ~Tx_R_O1_leq_inverse_of_sqrtopen_bracket_2_times_logopen_bracket_N_Tx_Ctr1_close_bracket_plus_1_close_bracket\n", "6 condition: ~N_Tx_Ctr1_geq_e_to_the_power_open_bracket_e_to_the_power_open_bracket_e_to_the_power_open_bracket_Weighted_HIE_DSA_to_the_power_open_bracket_inverse_of_4_close_bracket_close_bracket_close_bracket_close_bracket\n", "10 condition: 
~Weighted_HIE_DSA_geq_open_bracket_inverse_of_open_bracket_10_to_the_power_open_bracket_inverse_of_2_divided_by_Tx_R_O1_close_bracket_minus_N_Tx_Ctr1_close_bracket_close_bracket&Tx_R_O1_geq_open_bracket_inverse_of_logopen_bracket_10_to_the_power_open_bracket_10_to_the_power_Flagging_ratio_close_bracket_plus_N_Tx_Ctr1_close_bracket_close_bracket\n", "15 condition: Tx_R_O1_geq_open_bracket_logopen_bracket_flooropen_bracket_logopen_bracket_N_Tx_Ctr1_close_bracket_close_bracket_close_bracket_divided_by_logopen_bracket_10_close_bracket_minus_1_close_bracket_squared^Weighted_HIE_DSA_leq__minus_inverse_of_open_bracket_sqrtopen_bracket_N_Tx_Ctr1_close_bracket_plus_1_close_bracket_plus_Tx_R_O1\n", "21 condition: N_Tx_Ctr1_geq_ceilopen_bracket_10_to_the_power_open_bracket_2_times_sqrtopen_bracket_Tx_R_O1_close_bracket_plus_1_close_bracket_close_bracket^N_Tx_Ctr1_geq_logopen_bracket_10_to_the_power_open_bracket_10_to_the_power_open_bracket_10_to_the_power_Tx_R_O1_close_bracket_minus_1_close_bracket_close_bracket\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "/lustre/home/clarson/conjecturing/sage/conjecturing.py:279: RuntimeWarning: overflow encountered in scalar power\n", " return (lambda x: 10**x), 1\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "28 condition: N_Tx_Ctr1_geq_open_bracket_10_to_the_power_Tx_R_O1_close_bracket_to_the_power_open_bracket_10_to_the_power_open_bracket_Weighted_HIE_DSA_plus_1_close_bracket_minus_1_close_bracket^N_Tx_Ctr1_leq_ceilopen_bracket_e_to_the_power_open_bracket_inverse_of_4_divided_by_Tx_R_O1_squared_close_bracket_close_bracket\n", "36 condition: Tx_R_O1_geq_10_to_the_power_open_bracket_10_to_the_power_open_bracket_Weighted_HIE_DSA_minus_1_close_bracket_minus_1_close_bracket_minus_Flagging_ratio^Weighted_HIE_DSA_leq__minus_inverse_of_open_bracket_sqrtopen_bracket_N_Tx_Ctr1_close_bracket_plus_1_close_bracket_plus_Tx_R_O1\n", "45 condition: 
Tx_R_O1_leq_ceilopen_bracket_e_to_the_power_Weighted_HIE_DSA_close_bracket_divided_by_open_bracket_logopen_bracket_N_Tx_Ctr1_close_bracket_plus_1_close_bracket^Weighted_HIE_DSA_geq__minus_inverse_of_logopen_bracket_inverse_of_2_times_N_Tx_Ctr1_plus_inverse_of_2_close_bracket_plus_Tx_R_O1\n", "55 condition: Tx_R_O1_leq_10_to_the_power_open_bracket_Weighted_HIE_DSA_divided_by_10_to_the_power_logopen_bracket_Flagging_ratio_close_bracket_minus_1_close_bracket^Tx_R_O1_leq_open_bracket_logopen_bracket_2_times_sqrtopen_bracket_Weighted_HIE_DSA_close_bracket_close_bracket_divided_by_logopen_bracket_10_close_bracket_plus_1_close_bracket_squared\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "/lustre/home/clarson/anaconda3/envs/sage/lib/python3.11/site-packages/sage/misc/functional.py:1209: RuntimeWarning: divide by zero encountered in log\n", " return ln(args[0], **kwds)\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "66 condition: ~(Tx_R_O1_geq_open_bracket_inverse_of_open_bracket_10_to_the_power_e_to_the_power_Flagging_ratio_minus_logopen_bracket_N_Tx_Ctr1_close_bracket_close_bracket_close_bracket|Weighted_HIE_DSA_geq_open_bracket_inverse_of_2_times_Tx_R_O1_minus_1_close_bracket_to_the_power_ceilopen_bracket_inverse_of_2_times_N_Tx_Ctr1_close_bracket)\n", "value: TARGET_Worse than Expected\n", "66 condition: ~(Tx_R_O1_geq_open_bracket_inverse_of_open_bracket_10_to_the_power_e_to_the_power_Flagging_ratio_minus_logopen_bracket_N_Tx_Ctr1_close_bracket_close_bracket_close_bracket|N_Tx_Ctr1_geq_logopen_bracket_10_to_the_power_open_bracket_10_to_the_power_open_bracket_10_to_the_power_Tx_R_O1_close_bracket_minus_1_close_bracket_close_bracket)\n", "67 condition: Flagging_ratio_geq__minus_open_bracket_Tx_R_O1_times_Weighted_HIE_DSA_minus_1_close_bracket_to_the_power_N_Tx_Ctr1&PT_HOS_T_WorsethanExpected\n", "value: TARGET_Better than Expected\n", "67 condition: 
~N_Tx_Ctr1_geq__minus_inverse_of_open_bracket_Tx_R_O1_minus_4_times_Weighted_HIE_DSA_close_bracket\n", "68 condition: ~Tx_R_O1_geq_4_times_open_bracket_Flagging_ratio_minus_1_close_bracket_squared_divided_by_N_Tx_Ctr1\n", "70 condition: Weighted_HIE_DSA_geq_open_bracket_Tx_R_O1_minus_1_close_bracket_to_the_power_ceilopen_bracket_logopen_bracket_10_to_the_power_N_Tx_Ctr1_close_bracket_close_bracket^N_Tx_Ctr1_leq_10_to_the_power_flooropen_bracket_e_to_the_power_open_bracket_inverse_of_2_divided_by_Tx_R_O1_close_bracket_close_bracket\n", "73 condition: ~Weighted_HIE_DSA_leq_absopen_bracket_logopen_bracket_logopen_bracket_sqrtopen_bracket_Tx_R_O1_close_bracket_close_bracket_squared_close_bracket_close_bracket&PT_HOS_T_WorsethanExpected\n", "value: TARGET_Not Available\n", "73 condition: PT_HOS_T_NotAvailable\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
supportprecisionlift
029750.8517647058823531.05229309716101
146220.8308091735179581.02640406712479
224610.8529053230394151.05370224636923
326440.8131618759455371.00460211960247
423060.8026886383347790.991663199303691
535340.8053197509903790.994913746862447
620650.8445520581113801.04338239751681
717090.8262141603276771.02072726386198
841420.8242394978271371.01828771253496
950920.8303220738413201.02580229104295
1068400.8137426900584801.00531967302722
1123410.8517727466894491.05230303099103
122580.1007751937984501.33818630277404
136570.1689497716894982.24347145175290
141120.1696428571428572.14730604206232
152200.1727272727272732.18634797009982
165130.1052631578947371.33240042499158
17180.05555555555555560.703211335412222
188580.85198135198135223.5007903398461
\n", "
" ], "text/plain": [ " support precision lift\n", "0 2975 0.851764705882353 1.05229309716101\n", "1 4622 0.830809173517958 1.02640406712479\n", "2 2461 0.852905323039415 1.05370224636923\n", "3 2644 0.813161875945537 1.00460211960247\n", "4 2306 0.802688638334779 0.991663199303691\n", "5 3534 0.805319750990379 0.994913746862447\n", "6 2065 0.844552058111380 1.04338239751681\n", "7 1709 0.826214160327677 1.02072726386198\n", "8 4142 0.824239497827137 1.01828771253496\n", "9 5092 0.830322073841320 1.02580229104295\n", "10 6840 0.813742690058480 1.00531967302722\n", "11 2341 0.851772746689449 1.05230303099103\n", "12 258 0.100775193798450 1.33818630277404\n", "13 657 0.168949771689498 2.24347145175290\n", "14 112 0.169642857142857 2.14730604206232\n", "15 220 0.172727272727273 2.18634797009982\n", "16 513 0.105263157894737 1.33240042499158\n", "17 18 0.0555555555555556 0.703211335412222\n", "18 858 0.851981351981352 23.5007903398461" ] }, "execution_count": 22, "metadata": {}, "output_type": "execute_result" } ], "source": [
    "# Support, precision and lift for every conjecture on the test examples.\n",
    "# Each condition is printed with a running row number that matches the index of results_df.\n",
    "support = []\n",
    "lift = []\n",
    "precision = []\n",
    "count = 0  # running row number across all target values\n",
    "if \"TARGET\" in categorical_names:\n",
    "    for value in target_property_names:\n",
    "        print(\"value: {}\".format(value))\n",
    "        my_function = getattr(Example, value)\n",
    "        for condition in conditions[value]:\n",
    "            print(count, \"condition: {}\".format(condition))\n",
    "            count += 1  # advance by one per condition; adding the per-value index skipped and repeated numbers\n",
    "            num_true = 0\n",
    "            num_in_class = 0\n",
    "            num_hit = 0\n",
    "            for example in test_examples:\n",
    "                if condition(example) == True:\n",
    "                    num_true += 1\n",
    "                    if my_function(example) == True:\n",
    "                        num_hit += 1\n",
    "                if my_function(example) == True:\n",
    "                    num_in_class += 1\n",
    "            support.append(num_true)\n",
    "            if num_hit > 0:  # num_hit > 0 implies num_true > 0, so the divisions are safe\n",
    "                precision.append(n(num_hit/num_true))\n",
    "                lift.append(n(num_hit/num_true)/n(num_in_class/len(test_examples)))\n",
    "            else:\n",
    "                precision.append(0.0)\n",
    "                lift.append(0.0)\n",
    "    if len(target_property_names) == 1:\n",
    "        # Necessary conditions are scored through the contrapositive:\n",
    "        # condition False should imply target False.\n",
    "        my_function = getattr(Example, target_property_names[0])\n",
    "        for condition in conditions[\"necessary\"]:\n",
    "            print(count, \"condition: {}\".format(condition))\n",
    "            count += 1\n",
    "            num_false = 0\n",
    "            num_in_class = 0\n",
    "            num_hit = 0\n",
    "            for example in test_examples:\n",
    "                if condition(example) == False:\n",
    "                    num_false += 1\n",
    "                    if my_function(example) == False:\n",
    "                        num_hit += 1\n",
    "                if my_function(example) == False:\n",
    "                    num_in_class += 1\n",
    "            support.append(num_false)\n",
    "            if num_hit > 0:\n",
    "                precision.append(n(num_hit/num_false))\n",
    "                lift.append(n(num_hit/num_false)/n(num_in_class/len(test_examples)))\n",
    "            else:\n",
    "                precision.append(0.0)\n",
    "                lift.append(0.0)\n",
    "\n",
    "results_df = pd.DataFrame({'support':support, 'precision':precision, 'lift':lift})\n",
    "\n",
    "results_df"
] } ], "metadata": { "kernelspec": { "display_name": "SageMath 10.2", "language": "sage", "name": "sagemath" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.11.7" } }, "nbformat": 4, "nbformat_minor": 5 }