{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "acf961fd-2b08-491e-93be-e1404aa10029",
   "metadata": {},
   "source": [
    "# Titanic Example\n",
    "\n",
    "Load libraries."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "045e0f5b",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/tmp/ipykernel_2684129/3289088492.py:1: DeprecationWarning: \n",
      "Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),\n",
      "(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)\n",
      "but was not found to be installed on your system.\n",
      "If this would cause problems for you,\n",
      "please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466\n",
      "        \n",
      "  import pandas as pd\n"
     ]
    }
   ],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "import sys\n",
    "from sklearn.model_selection import train_test_split\n",
    "load(\"~/conjecturing/sage/conjecturing.py\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "c561be47-a291-4492-baa8-5cc698c879cb",
   "metadata": {},
   "source": [
    "Specify output files."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "4861be5f-d0ae-4a84-b768-b6903560f130",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Output files, opened for writing (mode \"w\" truncates any existing content).\n",
    "inv_file = open(\"2022_12_07_inv.txt\", mode=\"w\")\n",
    "prop_file = open(\"2022_12_07_prop.txt\", mode=\"w\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "f5db5172-0de7-4bfa-ad8e-6804f0b1798a",
   "metadata": {
    "tags": []
   },
   "source": [
    "Specify the number of training examples to use for conjecturing and the value of the skips setting."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "cc76b0a8-7d39-4ce4-aecc-7495a82aaafa",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Number of rows placed in the training split (passed to train_test_split as train_size).\n",
    "num_train = 10\n",
    "# NOTE(review): presumably the skips setting handed to the conjecturing step -- confirm against its caller.\n",
    "my_skips = 0.3"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "c229b455-e876-4ce7-a910-c6766f168ecf",
   "metadata": {},
   "source": [
    "Read data.  "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "e449e88a",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(891, 11)\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Survived</th>\n",
       "      <th>Pclass</th>\n",
       "      <th>Name</th>\n",
       "      <th>Sex</th>\n",
       "      <th>Age</th>\n",
       "      <th>SibSp</th>\n",
       "      <th>Parch</th>\n",
       "      <th>Ticket</th>\n",
       "      <th>Fare</th>\n",
       "      <th>Cabin</th>\n",
       "      <th>Embarked</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>PassengerId</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "      <td>Braund, Mr. Owen Harris</td>\n",
       "      <td>male</td>\n",
       "      <td>22.0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>A/5 21171</td>\n",
       "      <td>7.2500</td>\n",
       "      <td>NaN</td>\n",
       "      <td>S</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>Cumings, Mrs. John Bradley (Florence Briggs Th...</td>\n",
       "      <td>female</td>\n",
       "      <td>38.0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>PC 17599</td>\n",
       "      <td>71.2833</td>\n",
       "      <td>C85</td>\n",
       "      <td>C</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>1</td>\n",
       "      <td>3</td>\n",
       "      <td>Heikkinen, Miss. Laina</td>\n",
       "      <td>female</td>\n",
       "      <td>26.0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>STON/O2. 3101282</td>\n",
       "      <td>7.9250</td>\n",
       "      <td>NaN</td>\n",
       "      <td>S</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>Futrelle, Mrs. Jacques Heath (Lily May Peel)</td>\n",
       "      <td>female</td>\n",
       "      <td>35.0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>113803</td>\n",
       "      <td>53.1000</td>\n",
       "      <td>C123</td>\n",
       "      <td>S</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "      <td>Allen, Mr. William Henry</td>\n",
       "      <td>male</td>\n",
       "      <td>35.0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>373450</td>\n",
       "      <td>8.0500</td>\n",
       "      <td>NaN</td>\n",
       "      <td>S</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "             Survived  Pclass  \\\n",
       "PassengerId                     \n",
       "1                   0       3   \n",
       "2                   1       1   \n",
       "3                   1       3   \n",
       "4                   1       1   \n",
       "5                   0       3   \n",
       "\n",
       "                                                          Name     Sex   Age  \\\n",
       "PassengerId                                                                    \n",
       "1                                      Braund, Mr. Owen Harris    male  22.0   \n",
       "2            Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0   \n",
       "3                                       Heikkinen, Miss. Laina  female  26.0   \n",
       "4                 Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0   \n",
       "5                                     Allen, Mr. William Henry    male  35.0   \n",
       "\n",
       "             SibSp  Parch            Ticket     Fare Cabin Embarked  \n",
       "PassengerId                                                          \n",
       "1                1      0         A/5 21171   7.2500   NaN        S  \n",
       "2                1      0          PC 17599  71.2833   C85        C  \n",
       "3                0      0  STON/O2. 3101282   7.9250   NaN        S  \n",
       "4                1      0            113803  53.1000  C123        S  \n",
       "5                0      0            373450   8.0500   NaN        S  "
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# NOTE(review): int(...) presumably keeps these arguments plain Python ints under the Sage preparser -- confirm.\n",
    "my_data = pd.read_csv(\n",
    "    \"train.csv\",\n",
    "    header=int(0),     # column names come from the first row\n",
    "    index_col=int(0),  # first column becomes the row index\n",
    ")\n",
    "print(my_data.shape)\n",
    "my_data.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "cf280d83-a24c-4458-ab72-66a115e7cf99",
   "metadata": {},
   "source": [
    "Check the data types of the columns.  For categorical data, make sure the dtype is integer or object.  Make sure the category labels contain only letters, numbers, and \"_\" (no other special characters)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "17f4debf",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "Index: 891 entries, 1 to 891\n",
      "Data columns (total 11 columns):\n",
      " #   Column    Non-Null Count  Dtype  \n",
      "---  ------    --------------  -----  \n",
      " 0   Survived  891 non-null    int64  \n",
      " 1   Pclass    891 non-null    int64  \n",
      " 2   Name      891 non-null    object \n",
      " 3   Sex       891 non-null    object \n",
      " 4   Age       714 non-null    float64\n",
      " 5   SibSp     891 non-null    int64  \n",
      " 6   Parch     891 non-null    int64  \n",
      " 7   Ticket    891 non-null    object \n",
      " 8   Fare      891 non-null    float64\n",
      " 9   Cabin     204 non-null    object \n",
      " 10  Embarked  889 non-null    object \n",
      "dtypes: float64(2), int64(4), object(5)\n",
      "memory usage: 83.5+ KB\n"
     ]
    }
   ],
   "source": [
    "my_data.info()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "622c9753-c59a-4cb8-bfd4-bb03f9c5434b",
   "metadata": {},
   "source": [
    "Pandas thinks Survived and Pclass are integers/numeric, but they are categorical.  Recast them as objects."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "bc565d07",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Store the categorical codes as generic objects instead of int64.\n",
    "my_data = my_data.astype(dict.fromkeys([\"Survived\", \"Pclass\"], object))"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "7c50de95-24d6-4bcc-be94-21b42beb3dd2",
   "metadata": {},
   "source": [
    "Create a new feature which is the first letter of the cabin."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "77108245",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Keep only the first character of each cabin code; rows with a missing Cabin stay NaN.\n",
    "my_data = my_data.assign(cabin_letter=my_data[\"Cabin\"].str.slice(stop=1))"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "b5b3c159-cac7-4599-8680-8c39c381da77",
   "metadata": {},
   "source": [
    "Identify invariant and categorical columns and the target column.  The target should be in one of the lists."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "820e9888",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Column to predict; it must also appear in exactly one of the two lists below.\n",
    "target = \"Survived\"\n",
    "\n",
    "# Numeric columns.\n",
    "invariant_names = [\n",
    "    \"Age\",\n",
    "    \"Fare\",\n",
    "    \"SibSp\",\n",
    "    \"Parch\",\n",
    "]\n",
    "\n",
    "# Columns that get dummy-coded.\n",
    "categorical_names = [\n",
    "    \"Survived\",\n",
    "    \"Sex\",\n",
    "    \"Pclass\",\n",
    "    \"cabin_letter\",\n",
    "    \"Embarked\",\n",
    "]"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "88b339af-22c1-415b-8faa-fbfaa3437f71",
   "metadata": {},
   "source": [
    "Generic code starts here.  Rename target column.  Select columns."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "5ae99a05-9075-4ce7-9bb7-bb7137c3cb44",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Index(['TARGET', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket',\n",
      "       'Fare', 'Cabin', 'Embarked', 'cabin_letter'],\n",
      "      dtype='object')\n",
      "['TARGET', 'Sex', 'Pclass', 'cabin_letter', 'Embarked']\n"
     ]
    }
   ],
   "source": [
    "# Swap the target's entry for the fixed name \"TARGET\" so the cells below are dataset-agnostic.\n",
    "if target in categorical_names:\n",
    "    categorical_names[categorical_names.index(target)] = \"TARGET\"\n",
    "elif target in invariant_names:\n",
    "    invariant_names[invariant_names.index(target)] = \"TARGET\"\n",
    "elif \"TARGET\" not in invariant_names + categorical_names:\n",
    "    # Neither the original name nor the placeholder is listed: the configuration is wrong.\n",
    "    raise ValueError(\n",
    "        f\"target {target!r} must be listed in invariant_names or categorical_names\"\n",
    "    )\n",
    "# Otherwise this cell has already been run once and the lists are already renamed.\n",
    "\n",
    "my_data = my_data.rename(columns={target: \"TARGET\"})\n",
    "print(my_data.columns)\n",
    "my_data = my_data[invariant_names + categorical_names]  # keep only the listed columns\n",
    "print(categorical_names)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "94bcbfd3-514a-4a5b-bbea-9ffdd72425c1",
   "metadata": {},
   "source": [
    "Convert categorical variables to dummies.  One dummy for each binary variable and one dummy for each level for variables with more than two levels."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "daa9681d-dcc4-4600-86c2-8926b2ba2bd1",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Age</th>\n",
       "      <th>Fare</th>\n",
       "      <th>SibSp</th>\n",
       "      <th>Parch</th>\n",
       "      <th>TARGET_0</th>\n",
       "      <th>TARGET_1</th>\n",
       "      <th>TARGET_nan</th>\n",
       "      <th>Sex_female</th>\n",
       "      <th>Sex_male</th>\n",
       "      <th>Sex_nan</th>\n",
       "      <th>...</th>\n",
       "      <th>cabin_letter_D</th>\n",
       "      <th>cabin_letter_E</th>\n",
       "      <th>cabin_letter_F</th>\n",
       "      <th>cabin_letter_G</th>\n",
       "      <th>cabin_letter_T</th>\n",
       "      <th>cabin_letter_nan</th>\n",
       "      <th>Embarked_C</th>\n",
       "      <th>Embarked_Q</th>\n",
       "      <th>Embarked_S</th>\n",
       "      <th>Embarked_nan</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>PassengerId</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>22.0</td>\n",
       "      <td>7.2500</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>38.0</td>\n",
       "      <td>71.2833</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>26.0</td>\n",
       "      <td>7.9250</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>35.0</td>\n",
       "      <td>53.1000</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>35.0</td>\n",
       "      <td>8.0500</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5 rows × 27 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "              Age     Fare  SibSp  Parch  TARGET_0  TARGET_1  TARGET_nan  \\\n",
       "PassengerId                                                                \n",
       "1            22.0   7.2500      1      0         1         0           0   \n",
       "2            38.0  71.2833      1      0         0         1           0   \n",
       "3            26.0   7.9250      0      0         0         1           0   \n",
       "4            35.0  53.1000      1      0         0         1           0   \n",
       "5            35.0   8.0500      0      0         1         0           0   \n",
       "\n",
       "             Sex_female  Sex_male  Sex_nan  ...  cabin_letter_D  \\\n",
       "PassengerId                                 ...                   \n",
       "1                     0         1        0  ...               0   \n",
       "2                     1         0        0  ...               0   \n",
       "3                     1         0        0  ...               0   \n",
       "4                     1         0        0  ...               0   \n",
       "5                     0         1        0  ...               0   \n",
       "\n",
       "             cabin_letter_E  cabin_letter_F  cabin_letter_G  cabin_letter_T  \\\n",
       "PassengerId                                                                   \n",
       "1                         0               0               0               0   \n",
       "2                         0               0               0               0   \n",
       "3                         0               0               0               0   \n",
       "4                         0               0               0               0   \n",
       "5                         0               0               0               0   \n",
       "\n",
       "             cabin_letter_nan  Embarked_C  Embarked_Q  Embarked_S  \\\n",
       "PassengerId                                                         \n",
       "1                           1           0           0           1   \n",
       "2                           0           1           0           0   \n",
       "3                           1           0           0           1   \n",
       "4                           0           0           0           1   \n",
       "5                           1           0           0           1   \n",
       "\n",
       "             Embarked_nan  \n",
       "PassengerId                \n",
       "1                       0  \n",
       "2                       0  \n",
       "3                       0  \n",
       "4                       0  \n",
       "5                       0  \n",
       "\n",
       "[5 rows x 27 columns]"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Single switch for how missing values are handled: it drives both the property\n",
    "# names built here and the dummy_na argument of pd.get_dummies below.\n",
    "nan_is_level = True\n",
    "\n",
    "\n",
    "def strip_float_suffix(label):\n",
    "    \"\"\"Return `label` without a trailing '.0' (so a level stored as 1.0 reads as '1').\n",
    "\n",
    "    Only a suffix is removed; a '.0' in the middle of a name (e.g. '1.05') is left alone.\n",
    "    \"\"\"\n",
    "    suffix = '.0'\n",
    "    return label[:-len(suffix)] if label.endswith(suffix) else label\n",
    "\n",
    "\n",
    "def level_property_names(col):\n",
    "    \"\"\"Return the dummy-column names used as properties for categorical column `col`.\n",
    "\n",
    "    A column with exactly two levels yields one name (its second observed level),\n",
    "    a column with more than two levels yields one name per level, and a column\n",
    "    with fewer than two levels yields none.  Names are normalised with\n",
    "    strip_float_suffix so they match the renamed columns of `my_df`.\n",
    "    \"\"\"\n",
    "    values = my_data[col] if nan_is_level else my_data[col].dropna()\n",
    "    levels = list(values.unique())\n",
    "    if len(levels) < 2:\n",
    "        return []\n",
    "    if len(levels) == 2:  # just use one level for binary features\n",
    "        levels = levels[1:]\n",
    "    return [strip_float_suffix(col + \"_\" + str(level)) for level in levels]\n",
    "\n",
    "\n",
    "property_names = []\n",
    "for col in categorical_names:\n",
    "    if col != \"TARGET\":\n",
    "        property_names.extend(level_property_names(col))\n",
    "\n",
    "if \"TARGET\" in categorical_names:\n",
    "    target_property_names = level_property_names(\"TARGET\")\n",
    "\n",
    "my_df = pd.get_dummies(my_data,\n",
    "                       columns=categorical_names,\n",
    "                       dtype=np.uint8,\n",
    "                       dummy_na=nan_is_level,\n",
    "                       drop_first=False)  # False is the default\n",
    "\n",
    "# Apply the same normalisation to the dummy columns as to the property names above.\n",
    "my_df = my_df.rename(strip_float_suffix, axis='columns')\n",
    "my_df.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "2c9c7a94-00ee-49df-8b80-90ece844bee9",
   "metadata": {},
   "source": [
    "Define class, invariants, properties, and target properties (if applicable)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "5aaf4118",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['Sex_female', 'Pclass_3', 'Pclass_1', 'Pclass_2', 'cabin_letter_nan', 'cabin_letter_C', 'cabin_letter_E', 'cabin_letter_G', 'cabin_letter_D', 'cabin_letter_A', 'cabin_letter_B', 'cabin_letter_F', 'cabin_letter_T', 'Embarked_S', 'Embarked_C', 'Embarked_Q', 'Embarked_nan']\n"
     ]
    }
   ],
   "source": [
    "class Example:\n",
    "    \"\"\"One row of the data set, identified by its index label.\n",
    "\n",
    "    name -- the row's index label\n",
    "    mydf -- the frame the row belongs to (stored by reference, not copied)\n",
    "    \"\"\"\n",
    "\n",
    "    def __init__(self, name, mydf):\n",
    "        self.mydf = mydf\n",
    "        self.name = name\n",
    "        \n",
    "# Attach one accessor per column to Example: invariants first, then the\n",
    "# properties, then (for a categorical target) the target properties.\n",
    "target_is_categorical = \"TARGET\" in categorical_names\n",
    "prop_columns = property_names + (target_property_names if target_is_categorical else [])\n",
    "\n",
    "for builder, columns in ((build_inv, invariant_names), (build_prop, prop_columns)):\n",
    "    for column in columns:\n",
    "        accessor = builder(column)\n",
    "        setattr(Example, accessor.__name__, accessor)\n",
    "\n",
    "if not target_is_categorical:\n",
    "    # Numeric target: remember its position among the invariants.\n",
    "    target_invariant = invariant_names.index(\"TARGET\")\n",
    "print(property_names)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "1a474cff-5048-45c0-8cdb-f28ae59e6215",
   "metadata": {},
   "source": [
    "Split into training and testing data."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "a84753d5-f6ff-44a2-ae83-e084938808ab",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Stratify on the target levels only when the target is categorical;\n",
    "# stratify=None is train_test_split's default (no stratification).\n",
    "stratify_on = my_data[\"TARGET\"] if \"TARGET\" in categorical_names else None\n",
    "X_train, X_test = train_test_split(\n",
    "    my_df.index,\n",
    "    stratify=stratify_on,\n",
    "    train_size=num_train,\n",
    "    random_state=12345,\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "41f925f4-f870-46bc-9778-7935914574a7",
   "metadata": {},
   "source": [
    "Create examples for conjecturing."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "fee01df8",
   "metadata": {},
   "outputs": [],
   "source": [
    "def make_examples(row_ids):\n",
    "    \"\"\"Wrap every index label in `row_ids` in an Example that shares `my_df`.\"\"\"\n",
    "    return [Example(row_id, my_df) for row_id in row_ids]\n",
    "\n",
    "\n",
    "train_examples = make_examples(X_train)\n",
    "test_examples = make_examples(X_test)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "62bfa521-f831-45f2-86b2-9cd69f2cc4e8",
   "metadata": {},
   "source": [
    "Get lists of invariant and property functions."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "id": "3f252376",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Fetch the accessors straight from the class __dict__, i.e. the exact objects\n",
    "# stored by setattr, keyed by column name.\n",
    "invariants = [Example.__dict__[name] for name in invariant_names]\n",
    "properties = [Example.__dict__[name] for name in property_names]\n",
    "target_properties = (\n",
    "    [Example.__dict__[name] for name in target_property_names]\n",
    "    if \"TARGET\" in categorical_names\n",
    "    else []\n",
    ")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "id": "0367a44b-736d-4388-8e63-8307607f1f8b",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Age</th>\n",
       "      <th>Fare</th>\n",
       "      <th>SibSp</th>\n",
       "      <th>Parch</th>\n",
       "      <th>TARGET_0</th>\n",
       "      <th>TARGET_1</th>\n",
       "      <th>TARGET_nan</th>\n",
       "      <th>Sex_female</th>\n",
       "      <th>Sex_male</th>\n",
       "      <th>Sex_nan</th>\n",
       "      <th>...</th>\n",
       "      <th>cabin_letter_D</th>\n",
       "      <th>cabin_letter_E</th>\n",
       "      <th>cabin_letter_F</th>\n",
       "      <th>cabin_letter_G</th>\n",
       "      <th>cabin_letter_T</th>\n",
       "      <th>cabin_letter_nan</th>\n",
       "      <th>Embarked_C</th>\n",
       "      <th>Embarked_Q</th>\n",
       "      <th>Embarked_S</th>\n",
       "      <th>Embarked_nan</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>PassengerId</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>22.0</td>\n",
       "      <td>7.2500</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>38.0</td>\n",
       "      <td>71.2833</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>26.0</td>\n",
       "      <td>7.9250</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>35.0</td>\n",
       "      <td>53.1000</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>35.0</td>\n",
       "      <td>8.0500</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>887</th>\n",
       "      <td>27.0</td>\n",
       "      <td>13.0000</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>888</th>\n",
       "      <td>19.0</td>\n",
       "      <td>30.0000</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>889</th>\n",
       "      <td>NaN</td>\n",
       "      <td>23.4500</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>890</th>\n",
       "      <td>26.0</td>\n",
       "      <td>30.0000</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>891</th>\n",
       "      <td>32.0</td>\n",
       "      <td>7.7500</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>891 rows × 27 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "              Age     Fare  SibSp  Parch  TARGET_0  TARGET_1  TARGET_nan  \\\n",
       "PassengerId                                                                \n",
       "1            22.0   7.2500      1      0         1         0           0   \n",
       "2            38.0  71.2833      1      0         0         1           0   \n",
       "3            26.0   7.9250      0      0         0         1           0   \n",
       "4            35.0  53.1000      1      0         0         1           0   \n",
       "5            35.0   8.0500      0      0         1         0           0   \n",
       "...           ...      ...    ...    ...       ...       ...         ...   \n",
       "887          27.0  13.0000      0      0         1         0           0   \n",
       "888          19.0  30.0000      0      0         0         1           0   \n",
       "889           NaN  23.4500      1      2         1         0           0   \n",
       "890          26.0  30.0000      0      0         0         1           0   \n",
       "891          32.0   7.7500      0      0         1         0           0   \n",
       "\n",
       "             Sex_female  Sex_male  Sex_nan  ...  cabin_letter_D  \\\n",
       "PassengerId                                 ...                   \n",
       "1                     0         1        0  ...               0   \n",
       "2                     1         0        0  ...               0   \n",
       "3                     1         0        0  ...               0   \n",
       "4                     1         0        0  ...               0   \n",
       "5                     0         1        0  ...               0   \n",
       "...                 ...       ...      ...  ...             ...   \n",
       "887                   0         1        0  ...               0   \n",
       "888                   1         0        0  ...               0   \n",
       "889                   1         0        0  ...               0   \n",
       "890                   0         1        0  ...               0   \n",
       "891                   0         1        0  ...               0   \n",
       "\n",
       "             cabin_letter_E  cabin_letter_F  cabin_letter_G  cabin_letter_T  \\\n",
       "PassengerId                                                                   \n",
       "1                         0               0               0               0   \n",
       "2                         0               0               0               0   \n",
       "3                         0               0               0               0   \n",
       "4                         0               0               0               0   \n",
       "5                         0               0               0               0   \n",
       "...                     ...             ...             ...             ...   \n",
       "887                       0               0               0               0   \n",
       "888                       0               0               0               0   \n",
       "889                       0               0               0               0   \n",
       "890                       0               0               0               0   \n",
       "891                       0               0               0               0   \n",
       "\n",
       "             cabin_letter_nan  Embarked_C  Embarked_Q  Embarked_S  \\\n",
       "PassengerId                                                         \n",
       "1                           1           0           0           1   \n",
       "2                           0           1           0           0   \n",
       "3                           1           0           0           1   \n",
       "4                           0           0           0           1   \n",
       "5                           1           0           0           1   \n",
       "...                       ...         ...         ...         ...   \n",
       "887                         1           0           0           1   \n",
       "888                         0           0           0           1   \n",
       "889                         1           0           0           1   \n",
       "890                         0           1           0           0   \n",
       "891                         1           0           1           0   \n",
       "\n",
       "             Embarked_nan  \n",
       "PassengerId                \n",
       "1                       0  \n",
       "2                       0  \n",
       "3                       0  \n",
       "4                       0  \n",
       "5                       0  \n",
       "...                   ...  \n",
       "887                     0  \n",
       "888                     0  \n",
       "889                     0  \n",
       "890                     0  \n",
       "891                     0  \n",
       "\n",
       "[891 rows x 27 columns]"
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "my_df"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "5b513bc3-3f57-4c7b-8825-c3a9d19527b9",
   "metadata": {},
   "source": [
    "Invariant conjecturing - upper and lower bounds."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "id": "24ae1792",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "TARGET_1\n",
      "28\n",
      "TARGET_1 False\n",
      "41\n"
     ]
    }
   ],
   "source": [
    "# Operators the expression-tree search may use when building upper bounds and\n",
    "# lower bounds for each class.\n",
    "use_operators =  { '-1', '+1', '*2', '/2', '^2', '-()', '1/', \n",
    "                  'sqrt', 'ln', 'log10', 'exp', '10^', 'ceil', \n",
    "                  'floor', 'abs', '+', '*', 'max', 'min', '-', '/', '^'}\n",
    "\n",
    "def conjecture_bounds(examples, inv_of_interest):\n",
    "    \"\"\"Return upper-bound then lower-bound conjectures for one invariant.\n",
    "\n",
    "    Calls `conjecture` twice on `examples` (upperBound=True, then False) with\n",
    "    the shared `invariants`, `use_operators` and `my_skips`, handing\n",
    "    `inv_of_interest` through as the third positional argument, and runs\n",
    "    `convert_conjecture_names` on each batch before collecting it.\n",
    "    \"\"\"\n",
    "    found = []\n",
    "    for upper_bound in (True, False):\n",
    "        conjs = conjecture(examples,\n",
    "                           invariants,\n",
    "                           inv_of_interest,\n",
    "                           operators=use_operators,\n",
    "                           upperBound=upper_bound,\n",
    "                           time=Integer(5),\n",
    "                           skips=my_skips)\n",
    "        convert_conjecture_names(conjs)\n",
    "        found += conjs\n",
    "    return found\n",
    "\n",
    "def conjecture_every_invariant(examples):\n",
    "    \"\"\"Return the bound conjectures for every position in `invariants`, in order.\"\"\"\n",
    "    found = []\n",
    "    # Walk the positions directly: looking each one up with\n",
    "    # invariants.index(inv) rescans the list and only ever finds the first match.\n",
    "    for inv_of_interest in range(len(invariants)):\n",
    "        sys.stdout.flush()\n",
    "        found += conjecture_bounds(examples, inv_of_interest)\n",
    "    return found\n",
    "\n",
    "inv_conjectures = []\n",
    "\n",
    "if \"TARGET\" in categorical_names:\n",
    "    for value in target_property_names:\n",
    "        print(value)\n",
    "        target_property = Example.__dict__[value]\n",
    "        my_examples = [example for example in train_examples if target_property(example) == True]\n",
    "        inv_conjectures += conjecture_every_invariant(my_examples)\n",
    "    print(len(inv_conjectures))\n",
    "    if len(target_property_names) == 1:\n",
    "        # Single target property: also conjecture on the examples where it is False.\n",
    "        value = target_property_names[0]\n",
    "        print(value + \" False\")\n",
    "        target_property = Example.__dict__[value]\n",
    "        my_examples = [example for example in train_examples if target_property(example) == False]\n",
    "        inv_conjectures += conjecture_every_invariant(my_examples)\n",
    "else: # target is an invariant\n",
    "    my_examples = [example for example in train_examples]\n",
    "    inv_conjectures += conjecture_bounds(my_examples, target_invariant)\n",
    "print(len(inv_conjectures))\n",
    "\n",
    "# inv_file was opened in an earlier cell and is closed here, so re-running this\n",
    "# cell requires re-running the cell that opens the file first.\n",
    "for c in inv_conjectures:\n",
    "    inv_file.write(\"%s\\n\" % c)\n",
    "    inv_file.flush()\n",
    "inv_file.close()\n"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "2fc6d1d3-75e2-43e5-ad29-5e1669954216",
   "metadata": {},
   "source": [
    "Property conjecturing - sufficient conditions for each value of a categorical target.  For a binary target, get both sufficient and necessary conditions for the positive class; an example that fails a necessary condition is treated as belonging to the negative class."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "id": "9c8befb8",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "TARGET_1\n",
      "(Sex_female)->(TARGET_1)\n",
      "(Pclass_2)->(TARGET_1)\n",
      "(cabin_letter_C)->(TARGET_1)\n",
      "(~(Fare_leq_10_to_the_power_e_to_the_power_open_bracket_e_to_the_power_SibSp_divided_by_10_to_the_power_Parch_close_bracket))->(TARGET_1)\n",
      "TARGET_1 Necessary\n",
      "(TARGET_1)->(((Pclass_3)^(Sex_female))^(Fare_leq_10_to_the_power_e_to_the_power_open_bracket_e_to_the_power_SibSp_divided_by_10_to_the_power_Parch_close_bracket))\n",
      "(TARGET_1)->(Fare_geq_inverse_of_4_times_Age_plus_7_divided_by_4)\n"
     ]
    }
   ],
   "source": [
    "# Slot 0 of all_properties is the main property for each run; the \"TARGET\"\n",
    "# string is only a placeholder until a real property is assigned below.\n",
    "all_properties = [\"TARGET\"] + properties + inv_conjectures\n",
    "prop_conjs = []\n",
    "conditions = {}\n",
    "if \"TARGET\" in categorical_names:\n",
    "    for value in target_property_names:\n",
    "        print(value)\n",
    "        all_properties[0] = Example.__dict__[value]\n",
    "        these_prop_conjs = propertyBasedConjecture(\n",
    "            objects=train_examples,\n",
    "            properties=all_properties,\n",
    "            mainProperty=0,\n",
    "            skips=my_skips,\n",
    "        )\n",
    "        # Extract each premise once and keep it as a condition for this value.\n",
    "        conditions[value] = [get_premise(conj, myprint=False) for conj in these_prop_conjs]\n",
    "        prop_conjs.extend(these_prop_conjs)\n",
    "    if len(target_property_names) == 1:\n",
    "        # `value` still holds the single target property name from the loop above.\n",
    "        print(value + \" Necessary\")\n",
    "        all_properties[0] = Example.__dict__[value]\n",
    "        these_prop_conjs = propertyBasedConjecture(\n",
    "            objects=train_examples,\n",
    "            properties=all_properties,\n",
    "            mainProperty=0,\n",
    "            sufficient=False,\n",
    "            skips=my_skips,\n",
    "        )\n",
    "        # For the sufficient=False run the conclusion side is what gets stored.\n",
    "        conditions[\"necessary\"] = [get_conclusion(conj, myprint=False) for conj in these_prop_conjs]\n",
    "        prop_conjs.extend(these_prop_conjs)\n",
    "\n",
    "# prop_file was opened in an earlier cell; write one conjecture name per line, then close it.\n",
    "for conj in prop_conjs:\n",
    "    prop_file.write(\"%s\\n\" % convert_name_back(conj.__name__))\n",
    "    prop_file.flush()\n",
    "\n",
    "prop_file.close()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "id": "50788eb0-61a3-4aad-87f0-3ae3508bb38b",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "2"
      ]
     },
     "execution_count": 18,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(these_prop_conjs)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "9e3b80c5-fa02-4d58-bfe2-b44ff0b81666",
   "metadata": {},
   "source": [
    "Apply conjectures to train and test data if target is categorical."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "id": "f3ab0127-5bef-46ca-9f52-f0a27f35e55a",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/lustre/home/clarson/conjecturing/sage/conjecturing.py:279: RuntimeWarning: overflow encountered in scalar power\n",
      "  return (lambda x: 10**x), 1\n",
      "/lustre/home/clarson/conjecturing/sage/conjecturing.py:279: RuntimeWarning: overflow encountered in scalar power\n",
      "  return (lambda x: 10**x), 1\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'TARGET_1': [Sex_female, Pclass_2, cabin_letter_C, ~Fare_leq_10_to_the_power_e_to_the_power_open_bracket_e_to_the_power_SibSp_divided_by_10_to_the_power_Parch_close_bracket], 'necessary': [(Pclass_3^Sex_female)^Fare_leq_10_to_the_power_e_to_the_power_open_bracket_e_to_the_power_SibSp_divided_by_10_to_the_power_Parch_close_bracket, Fare_geq_inverse_of_4_times_Age_plus_7_divided_by_4]}\n"
     ]
    }
   ],
   "source": [
    "X_train_df = my_df.loc[X_train,property_names+invariant_names]  # drop target and one level for each binary variable\n",
    "X_test_df = my_df.loc[X_test,property_names+invariant_names]\n",
    "y_train_df = my_data.loc[X_train,\"TARGET\"] # get original target, even if it is multiple levels\n",
    "y_test_df = my_data.loc[X_test, \"TARGET\"]\n",
    "if \"TARGET\" in categorical_names:\n",
    "    # Gather every condition in a fixed order: the sufficient conditions of each\n",
    "    # target value, followed by the necessary conditions when there is a single\n",
    "    # target property.\n",
    "    condition_keys = list(target_property_names)\n",
    "    if len(target_property_names) == 1:\n",
    "        condition_keys.append(\"necessary\")\n",
    "    all_conditions = [condition for key in condition_keys for condition in conditions[key]]\n",
    "    # Number the columns with one running counter so that no condition\n",
    "    # overwrites the column of another one (per-value numbering and a fixed\n",
    "    # index for the necessary conditions both produced name collisions).\n",
    "    for index, condition in enumerate(all_conditions):\n",
    "        X_train_df['conj_' + str(index)] = [condition(example) for example in train_examples]\n",
    "        X_test_df['conj_' + str(index)] = [condition(example) for example in test_examples]\n",
    "\n",
    "    print(conditions)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "b410f6ff-3cff-41f6-aaab-7a60f6f6cbe3",
   "metadata": {},
   "source": [
    "Calculate support, precision, and lift."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "id": "498f947a-8e7c-4492-88f7-802df56d1898",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['TARGET_1']"
      ]
     },
     "execution_count": 20,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "target_property_names"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "id": "f9e29f39-e7e2-46bb-8154-ec731a18109c",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "value: TARGET_1\n",
      "condition: Sex_female\n",
      "condition: Pclass_2\n",
      "condition: cabin_letter_C\n",
      "condition: ~Fare_leq_10_to_the_power_e_to_the_power_open_bracket_e_to_the_power_SibSp_divided_by_10_to_the_power_Parch_close_bracket\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/lustre/home/clarson/conjecturing/sage/conjecturing.py:279: RuntimeWarning: overflow encountered in scalar power\n",
      "  return (lambda x: 10**x), 1\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "condition: (Pclass_3^Sex_female)^Fare_leq_10_to_the_power_e_to_the_power_open_bracket_e_to_the_power_SibSp_divided_by_10_to_the_power_Parch_close_bracket\n",
      "condition: Fare_geq_inverse_of_4_times_Age_plus_7_divided_by_4\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>support</th>\n",
       "      <th>precision</th>\n",
       "      <th>lift</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>313</td>\n",
       "      <td>0.741214057507987</td>\n",
       "      <td>1.93198101971756</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>183</td>\n",
       "      <td>0.469945355191257</td>\n",
       "      <td>1.22491673941863</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>58</td>\n",
       "      <td>0.586206896551724</td>\n",
       "      <td>1.52795347888186</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>135</td>\n",
       "      <td>0.600000000000000</td>\n",
       "      <td>1.56390532544379</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>506</td>\n",
       "      <td>0.656126482213439</td>\n",
       "      <td>1.06454407151020</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>318</td>\n",
       "      <td>0.764150943396226</td>\n",
       "      <td>1.23981027832795</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "  support          precision              lift\n",
       "0     313  0.741214057507987  1.93198101971756\n",
       "1     183  0.469945355191257  1.22491673941863\n",
       "2      58  0.586206896551724  1.52795347888186\n",
       "3     135  0.600000000000000  1.56390532544379\n",
       "4     506  0.656126482213439  1.06454407151020\n",
       "5     318  0.764150943396226  1.23981027832795"
      ]
     },
     "execution_count": 21,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "def score_condition(condition, class_test, outcome):\n",
    "    \"\"\"Return (support, precision, lift) of one condition over test_examples.\n",
    "\n",
    "    An example is selected when condition(example) == outcome and is in the\n",
    "    class when class_test(example) == outcome. Support is the number selected,\n",
    "    precision is the share of selected examples that are in the class, and\n",
    "    lift is precision divided by the share of all test examples in the class.\n",
    "    Precision and lift are 0.0 when no selected example is in the class.\n",
    "    \"\"\"\n",
    "    num_selected = 0\n",
    "    num_in_class = 0\n",
    "    num_hit = 0\n",
    "    for example in test_examples:\n",
    "        selected = condition(example) == outcome\n",
    "        in_class = class_test(example) == outcome\n",
    "        if selected:\n",
    "            num_selected += 1\n",
    "            if in_class:\n",
    "                num_hit += 1\n",
    "        if in_class:\n",
    "            num_in_class += 1\n",
    "    if num_hit > 0:\n",
    "        hit_rate = n(num_hit/num_selected)\n",
    "        return num_selected, hit_rate, hit_rate/n(num_in_class/len(test_examples))\n",
    "    return num_selected, 0.0, 0.0\n",
    "\n",
    "support = []\n",
    "lift = []\n",
    "precision = []\n",
    "if \"TARGET\" in categorical_names:\n",
    "    for value in target_property_names:\n",
    "        print(\"value: {}\".format(value))\n",
    "        my_function = getattr(Example, value)\n",
    "        # Sufficient conditions: score the examples where the condition is True.\n",
    "        for condition in conditions[value]:\n",
    "            print(\"condition: {}\".format(condition))\n",
    "            count, hit_rate, gain = score_condition(condition, my_function, True)\n",
    "            support.append(count)\n",
    "            precision.append(hit_rate)\n",
    "            lift.append(gain)\n",
    "    if len(target_property_names) == 1:\n",
    "        # Necessary conditions: score the examples where the condition is False,\n",
    "        # against my_function (set by the loop above) also being False.\n",
    "        for condition in conditions[\"necessary\"]:\n",
    "            print(\"condition: {}\".format(condition))\n",
    "            count, hit_rate, gain = score_condition(condition, my_function, False)\n",
    "            support.append(count)\n",
    "            precision.append(hit_rate)\n",
    "            lift.append(gain)\n",
    "\n",
    "results_df = pd.DataFrame({'support':support, 'precision':precision, 'lift':lift})\n",
    "\n",
    "results_df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "id": "eae54efa-df74-4850-a64b-1aff582176ea",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[313, 183, 58, 135, 506, 318]"
      ]
     },
     "execution_count": 22,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "support"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "id": "73909b37-f487-4a62-b4ce-a7161de5d410",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[0.741214057507987,\n",
       " 0.469945355191257,\n",
       " 0.586206896551724,\n",
       " 0.600000000000000,\n",
       " 0.656126482213439,\n",
       " 0.764150943396226]"
      ]
     },
     "execution_count": 23,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "precision"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "id": "383916ba-442f-4a7b-984b-8d1431891ca4",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[1.93198101971756,\n",
       " 1.22491673941863,\n",
       " 1.52795347888186,\n",
       " 1.56390532544379,\n",
       " 1.06454407151020,\n",
       " 1.23981027832795]"
      ]
     },
     "execution_count": 24,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "lift"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "id": "34507fd2-6736-4653-9f0d-84a0c409242b",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'TARGET_1': [Sex_female,\n",
       "  Pclass_2,\n",
       "  cabin_letter_C,\n",
       "  ~Fare_leq_10_to_the_power_e_to_the_power_open_bracket_e_to_the_power_SibSp_divided_by_10_to_the_power_Parch_close_bracket],\n",
       " 'necessary': [(Pclass_3^Sex_female)^Fare_leq_10_to_the_power_e_to_the_power_open_bracket_e_to_the_power_SibSp_divided_by_10_to_the_power_Parch_close_bracket,\n",
       "  Fare_geq_inverse_of_4_times_Age_plus_7_divided_by_4]}"
      ]
     },
     "execution_count": 25,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "conditions"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8ba9af84-ddf2-4554-a05c-a97ab30c20f2",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "SageMath 10.2",
   "language": "sage",
   "name": "sagemath"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}