{ "cells": [ { "cell_type": "markdown", "id": "acf961fd-2b08-491e-93be-e1404aa10029", "metadata": {}, "source": [ "# Titanic Example\n", "\n", "Load libraries." ] }, { "cell_type": "code", "execution_count": 1, "id": "045e0f5b", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/tmp/ipykernel_2045038/3360833649.py:1: DeprecationWarning: \n", "Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),\n", "(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)\n", "but was not found to be installed on your system.\n", "If this would cause problems for you,\n", "please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466\n", " \n", " import pandas as pd\n" ] } ], "source": [ "import pandas as pd\n", "import numpy as np\n", "import sys\n", "from sklearn.model_selection import train_test_split\n", "load(\"conjecturing.py\")" ] }, { "cell_type": "markdown", "id": "c561be47-a291-4492-baa8-5cc698c879cb", "metadata": {}, "source": [ "Specify output files." ] }, { "cell_type": "code", "execution_count": 2, "id": "4861be5f-d0ae-4a84-b768-b6903560f130", "metadata": {}, "outputs": [], "source": [ "inv_file = open(\"2022_12_07_inv.txt\", \"w\")\n", "prop_file = open(\"2022_12_07_prop.txt\", \"w\")" ] }, { "cell_type": "markdown", "id": "f5db5172-0de7-4bfa-ad8e-6804f0b1798a", "metadata": { "tags": [] }, "source": [ "Specify the number of examples to use for conjecturing and skips." ] }, { "cell_type": "code", "execution_count": 3, "id": "cc76b0a8-7d39-4ce4-aecc-7495a82aaafa", "metadata": {}, "outputs": [], "source": [ "num_train = 10\n", "my_skips = 0.3" ] }, { "cell_type": "markdown", "id": "c229b455-e876-4ce7-a910-c6766f168ecf", "metadata": {}, "source": [ "Read data. " ] }, { "cell_type": "code", "execution_count": 4, "id": "e449e88a", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "(891, 11)\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
SurvivedPclassNameSexAgeSibSpParchTicketFareCabinEmbarked
PassengerId
103Braund, Mr. Owen Harrismale22.010A/5 211717.2500NaNS
211Cumings, Mrs. John Bradley (Florence Briggs Th...female38.010PC 1759971.2833C85C
313Heikkinen, Miss. Lainafemale26.000STON/O2. 31012827.9250NaNS
411Futrelle, Mrs. Jacques Heath (Lily May Peel)female35.01011380353.1000C123S
503Allen, Mr. William Henrymale35.0003734508.0500NaNS
\n", "
" ], "text/plain": [ " Survived Pclass \\\n", "PassengerId \n", "1 0 3 \n", "2 1 1 \n", "3 1 3 \n", "4 1 1 \n", "5 0 3 \n", "\n", " Name Sex Age \\\n", "PassengerId \n", "1 Braund, Mr. Owen Harris male 22.0 \n", "2 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 \n", "3 Heikkinen, Miss. Laina female 26.0 \n", "4 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 \n", "5 Allen, Mr. William Henry male 35.0 \n", "\n", " SibSp Parch Ticket Fare Cabin Embarked \n", "PassengerId \n", "1 1 0 A/5 21171 7.2500 NaN S \n", "2 1 0 PC 17599 71.2833 C85 C \n", "3 0 0 STON/O2. 3101282 7.9250 NaN S \n", "4 1 0 113803 53.1000 C123 S \n", "5 0 0 373450 8.0500 NaN S " ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "my_data =pd.read_csv(\"train.csv\",\n", " index_col=int(0),\n", " header=int(0)\n", " )\n", "print(my_data.shape)\n", "my_data.head()" ] }, { "cell_type": "markdown", "id": "cf280d83-a24c-4458-ab72-66a115e7cf99", "metadata": {}, "source": [ "Check the data types of the columns. For categorical data, make sure the type is integer or objects. Make sure the categories do not contain special characters besides numbers and \"_\"." ] }, { "cell_type": "code", "execution_count": 5, "id": "17f4debf", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "Index: 891 entries, 1 to 891\n", "Data columns (total 11 columns):\n", " # Column Non-Null Count Dtype \n", "--- ------ -------------- ----- \n", " 0 Survived 891 non-null int64 \n", " 1 Pclass 891 non-null int64 \n", " 2 Name 891 non-null object \n", " 3 Sex 891 non-null object \n", " 4 Age 714 non-null float64\n", " 5 SibSp 891 non-null int64 \n", " 6 Parch 891 non-null int64 \n", " 7 Ticket 891 non-null object \n", " 8 Fare 891 non-null float64\n", " 9 Cabin 204 non-null object \n", " 10 Embarked 889 non-null object \n", "dtypes: float64(2), int64(4), object(5)\n", "memory usage: 83.5+ KB\n" ] } ], "source": [ "my_data.info()" ] }, { "cell_type": "markdown", "id": "622c9753-c59a-4cb8-bfd4-bb03f9c5434b", "metadata": {}, "source": [ "Pandas thinks Survived and Pclass are integers/numeric, but they are categorical. Recast them as objects." ] }, { "cell_type": "code", "execution_count": 6, "id": "bc565d07", "metadata": {}, "outputs": [], "source": [ "my_data = my_data.astype({\"Survived\": object, \"Pclass\": object})" ] }, { "cell_type": "markdown", "id": "7c50de95-24d6-4bcc-be94-21b42beb3dd2", "metadata": {}, "source": [ "Create a new feature which is the first letter of the cabin." ] }, { "cell_type": "code", "execution_count": 7, "id": "77108245", "metadata": {}, "outputs": [], "source": [ "my_data[\"cabin_letter\"]=my_data[\"Cabin\"].str[:1]" ] }, { "cell_type": "markdown", "id": "b5b3c159-cac7-4599-8680-8c39c381da77", "metadata": {}, "source": [ "Identify invariant and categorical columns and the target column. The target should be in one of the lists." ] }, { "cell_type": "code", "execution_count": 8, "id": "820e9888", "metadata": {}, "outputs": [], "source": [ "invariant_names=[\"Age\", \"Fare\", \"SibSp\", \"Parch\"] \n", "categorical_names=[\"Survived\", \"Sex\",\"Pclass\", \"cabin_letter\", \"Embarked\"]\n", "target = \"Survived\"" ] }, { "cell_type": "markdown", "id": "88b339af-22c1-415b-8faa-fbfaa3437f71", "metadata": {}, "source": [ "Generic code starts here. Rename target column. Select columns." ] }, { "cell_type": "code", "execution_count": 9, "id": "5ae99a05-9075-4ce7-9bb7-bb7137c3cb44", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Index(['TARGET', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket',\n", " 'Fare', 'Cabin', 'Embarked', 'cabin_letter'],\n", " dtype='object')\n", "['TARGET', 'Sex', 'Pclass', 'cabin_letter', 'Embarked']\n" ] } ], "source": [ "if target in categorical_names:\n", " categorical_names[categorical_names.index(target)] = \"TARGET\"\n", "else:\n", " invariant_names[invariant_names.index(target)] = \"TARGET\"\n", "\n", "my_data = my_data.rename(columns={target: \"TARGET\"})\n", "print(my_data.columns)\n", "my_data = my_data[invariant_names + categorical_names]\n", "print(categorical_names)" ] }, { "cell_type": "markdown", "id": "94bcbfd3-514a-4a5b-bbea-9ffdd72425c1", "metadata": {}, "source": [ "Convert categorical variables to dummies. One dummy for each binary variable and one dummy for each level for variables with more than two levels." ] }, { "cell_type": "code", "execution_count": 10, "id": "daa9681d-dcc4-4600-86c2-8926b2ba2bd1", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
AgeFareSibSpParchTARGET_0TARGET_1TARGET_nanSex_femaleSex_maleSex_nan...cabin_letter_Dcabin_letter_Ecabin_letter_Fcabin_letter_Gcabin_letter_Tcabin_letter_nanEmbarked_CEmbarked_QEmbarked_SEmbarked_nan
PassengerId
122.07.250010100010...0000010010
238.071.283310010100...0000001000
326.07.925000010100...0000010010
435.053.100010010100...0000000010
535.08.050000100010...0000010010
\n", "

5 rows × 27 columns

\n", "
" ], "text/plain": [ " Age Fare SibSp Parch TARGET_0 TARGET_1 TARGET_nan \\\n", "PassengerId \n", "1 22.0 7.2500 1 0 1 0 0 \n", "2 38.0 71.2833 1 0 0 1 0 \n", "3 26.0 7.9250 0 0 0 1 0 \n", "4 35.0 53.1000 1 0 0 1 0 \n", "5 35.0 8.0500 0 0 1 0 0 \n", "\n", " Sex_female Sex_male Sex_nan ... cabin_letter_D \\\n", "PassengerId ... \n", "1 0 1 0 ... 0 \n", "2 1 0 0 ... 0 \n", "3 1 0 0 ... 0 \n", "4 1 0 0 ... 0 \n", "5 0 1 0 ... 0 \n", "\n", " cabin_letter_E cabin_letter_F cabin_letter_G cabin_letter_T \\\n", "PassengerId \n", "1 0 0 0 0 \n", "2 0 0 0 0 \n", "3 0 0 0 0 \n", "4 0 0 0 0 \n", "5 0 0 0 0 \n", "\n", " cabin_letter_nan Embarked_C Embarked_Q Embarked_S \\\n", "PassengerId \n", "1 1 0 0 1 \n", "2 0 1 0 0 \n", "3 1 0 0 1 \n", "4 0 0 0 1 \n", "5 1 0 0 1 \n", "\n", " Embarked_nan \n", "PassengerId \n", "1 0 \n", "2 0 \n", "3 0 \n", "4 0 \n", "5 0 \n", "\n", "[5 rows x 27 columns]" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "property_names = []\n", "for col in categorical_names:\n", " if col != \"TARGET\":\n", " unique_vals=list(my_data[col].unique()) # if nan is a level\n", " #unique_vals=list(my_data[col].dropna().unique()) # if nan is not a level\n", " if len(unique_vals)==2: # just use one level for binary features\n", " property_names.append(col+\"_\"+str(unique_vals[1]))\n", " elif len(unique_vals) > 2: #one property for each level.\n", " for level in unique_vals:\n", " property_names.append(col+\"_\"+str(level))\n", "\n", "\n", "if \"TARGET\" in categorical_names:\n", " target_property_names = []\n", " unique_vals = list(my_data[\"TARGET\"].unique()) # if nan is a level\n", " #unique_vals = list(my_data[\"TARGET\"].dropna().unique()) # if nan is not a level\n", " if len(unique_vals)==2:\n", " target_property_names.append(\"TARGET_\"+str(unique_vals[1]))\n", " elif len(unique_vals) > 2:\n", " for level in unique_vals:\n", " target_property_names.append(\"TARGET_\"+str(level))\n", " \n", "my_df = pd.get_dummies(my_data, \n", " columns=categorical_names,\n", " dtype=np.uint8,\n", " dummy_na=True, # False is the default. If False, use dropna() above\n", " drop_first=False) # False is the default\n", "\n", "my_df = my_df.rename(lambda col: col.replace('.0', ''), axis='columns')\n", "my_df.head()" ] }, { "cell_type": "markdown", "id": "2c9c7a94-00ee-49df-8b80-90ece844bee9", "metadata": {}, "source": [ "Define class, invariants, properties, and target properties (if applicable)." ] }, { "cell_type": "code", "execution_count": 11, "id": "5aaf4118", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "['Sex_female', 'Pclass_3', 'Pclass_1', 'Pclass_2', 'cabin_letter_nan', 'cabin_letter_C', 'cabin_letter_E', 'cabin_letter_G', 'cabin_letter_D', 'cabin_letter_A', 'cabin_letter_B', 'cabin_letter_F', 'cabin_letter_T', 'Embarked_S', 'Embarked_C', 'Embarked_Q', 'Embarked_nan']\n" ] } ], "source": [ "class Example():\n", " def __init__(self, name, mydf):\n", " self.name = name\n", " self.mydf = mydf\n", " \n", "for i in invariant_names:\n", " inv = build_inv(i)\n", " setattr(Example,inv.__name__,inv )\n", "\n", "for i in property_names:\n", " prop = build_prop(i)\n", " setattr(Example, prop.__name__,prop)\n", "\n", "if \"TARGET\" in categorical_names:\n", " for i in target_property_names:\n", " prop = build_prop(i)\n", " setattr(Example, prop.__name__, prop)\n", "else:\n", " target_invariant = invariant_names.index(\"TARGET\")\n", "print(property_names)" ] }, { "cell_type": "markdown", "id": "1a474cff-5048-45c0-8cdb-f28ae59e6215", "metadata": {}, "source": [ "Split into training and testing data." ] }, { "cell_type": "code", "execution_count": 12, "id": "a84753d5-f6ff-44a2-ae83-e084938808ab", "metadata": {}, "outputs": [], "source": [ "if \"TARGET\" in categorical_names:\n", " X_train, X_test = train_test_split(\n", " my_df.index,\n", " stratify=my_data[\"TARGET\"], # stratify on target levels\n", " train_size=num_train,\n", " random_state=12345\n", " )\n", "else:\n", " X_train, X_test = train_test_split(\n", " my_df.index,\n", " train_size=num_train,\n", " random_state=12345\n", " )" ] }, { "cell_type": "markdown", "id": "41f925f4-f870-46bc-9778-7935914574a7", "metadata": {}, "source": [ "Create examples for conjecturing." ] }, { "cell_type": "code", "execution_count": 13, "id": "fee01df8", "metadata": {}, "outputs": [], "source": [ "train_examples = [Example(i, my_df) for i in X_train]\n", "test_examples = [Example(i, my_df) for i in X_test]" ] }, { "cell_type": "markdown", "id": "62bfa521-f831-45f2-86b2-9cd69f2cc4e8", "metadata": {}, "source": [ "Get lists of invariant and property functions." ] }, { "cell_type": "code", "execution_count": 14, "id": "3f252376", "metadata": {}, "outputs": [], "source": [ "invariants =[]\n", "for i in invariant_names:\n", " invariants.append(Example.__dict__[i])\n", "properties=[]\n", "for i in property_names:\n", " properties.append(Example.__dict__[i])\n", "target_properties=[]\n", "if \"TARGET\" in categorical_names:\n", " for i in target_property_names:\n", " target_properties.append(Example.__dict__[i])\n" ] }, { "cell_type": "code", "execution_count": 15, "id": "0367a44b-736d-4388-8e63-8307607f1f8b", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
AgeFareSibSpParchTARGET_0TARGET_1TARGET_nanSex_femaleSex_maleSex_nan...cabin_letter_Dcabin_letter_Ecabin_letter_Fcabin_letter_Gcabin_letter_Tcabin_letter_nanEmbarked_CEmbarked_QEmbarked_SEmbarked_nan
PassengerId
122.07.250010100010...0000010010
238.071.283310010100...0000001000
326.07.925000010100...0000010010
435.053.100010010100...0000000010
535.08.050000100010...0000010010
..................................................................
88727.013.000000100010...0000010010
88819.030.000000010100...0000000010
889NaN23.450012100100...0000010010
89026.030.000000010010...0000001000
89132.07.750000100010...0000010100
\n", "

891 rows × 27 columns

\n", "
" ], "text/plain": [ " Age Fare SibSp Parch TARGET_0 TARGET_1 TARGET_nan \\\n", "PassengerId \n", "1 22.0 7.2500 1 0 1 0 0 \n", "2 38.0 71.2833 1 0 0 1 0 \n", "3 26.0 7.9250 0 0 0 1 0 \n", "4 35.0 53.1000 1 0 0 1 0 \n", "5 35.0 8.0500 0 0 1 0 0 \n", "... ... ... ... ... ... ... ... \n", "887 27.0 13.0000 0 0 1 0 0 \n", "888 19.0 30.0000 0 0 0 1 0 \n", "889 NaN 23.4500 1 2 1 0 0 \n", "890 26.0 30.0000 0 0 0 1 0 \n", "891 32.0 7.7500 0 0 1 0 0 \n", "\n", " Sex_female Sex_male Sex_nan ... cabin_letter_D \\\n", "PassengerId ... \n", "1 0 1 0 ... 0 \n", "2 1 0 0 ... 0 \n", "3 1 0 0 ... 0 \n", "4 1 0 0 ... 0 \n", "5 0 1 0 ... 0 \n", "... ... ... ... ... ... \n", "887 0 1 0 ... 0 \n", "888 1 0 0 ... 0 \n", "889 1 0 0 ... 0 \n", "890 0 1 0 ... 0 \n", "891 0 1 0 ... 0 \n", "\n", " cabin_letter_E cabin_letter_F cabin_letter_G cabin_letter_T \\\n", "PassengerId \n", "1 0 0 0 0 \n", "2 0 0 0 0 \n", "3 0 0 0 0 \n", "4 0 0 0 0 \n", "5 0 0 0 0 \n", "... ... ... ... ... \n", "887 0 0 0 0 \n", "888 0 0 0 0 \n", "889 0 0 0 0 \n", "890 0 0 0 0 \n", "891 0 0 0 0 \n", "\n", " cabin_letter_nan Embarked_C Embarked_Q Embarked_S \\\n", "PassengerId \n", "1 1 0 0 1 \n", "2 0 1 0 0 \n", "3 1 0 0 1 \n", "4 0 0 0 1 \n", "5 1 0 0 1 \n", "... ... ... ... ... \n", "887 1 0 0 1 \n", "888 0 0 0 1 \n", "889 1 0 0 1 \n", "890 0 1 0 0 \n", "891 1 0 1 0 \n", "\n", " Embarked_nan \n", "PassengerId \n", "1 0 \n", "2 0 \n", "3 0 \n", "4 0 \n", "5 0 \n", "... ... \n", "887 0 \n", "888 0 \n", "889 0 \n", "890 0 \n", "891 0 \n", "\n", "[891 rows x 27 columns]" ] }, "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ "my_df" ] }, { "cell_type": "markdown", "id": "5b513bc3-3f57-4c7b-8825-c3a9d19527b9", "metadata": {}, "source": [ "Invariant conjecturing - upper and lower bounds." ] }, { "cell_type": "code", "execution_count": 16, "id": "24ae1792", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "TARGET_1\n", "28\n", "TARGET_1 False\n", "41\n" ] } ], "source": [ "#define operators for expression tree to build upper bounsand lower bouns for each class\n", "use_operators = { '-1', '+1', '*2', '/2', '^2', '-()', '1/', \n", " 'sqrt', 'ln', 'log10', 'exp', '10^', 'ceil', \n", " 'floor', 'abs', '+', '*', 'max', 'min', '-', '/', '^'}\n", "\n", "inv_conjectures = []\n", "\n", "if \"TARGET\" in categorical_names:\n", " for value in target_property_names:\n", " print(value)\n", " target_property = Example.__dict__[value]\n", " my_examples = [example for example in train_examples if target_property(example) == True]\n", " for inv in invariants:\n", " sys.stdout.flush()\n", " inv_of_interest = invariants.index(inv)\n", " conjs = conjecture(my_examples, \n", " invariants, \n", " inv_of_interest, \n", " operators=use_operators, \n", " upperBound=True, \n", " time=Integer(5)\n", " # ,debug=True,\n", " # verbose=True,\n", " ,skips=my_skips\n", " )\n", " convert_conjecture_names(conjs)\n", " inv_conjectures += conjs\n", "\n", " conjs = conjecture(my_examples, \n", " invariants, \n", " inv_of_interest, \n", " operators=use_operators, \n", " upperBound=False, \n", " time=Integer(5)\n", " ,skips=my_skips)\n", " convert_conjecture_names(conjs)\n", " inv_conjectures += conjs\n", " print(len(inv_conjectures))\n", " if len(target_property_names) == 1:\n", " value = target_property_names[0]\n", " print(value + \" False\")\n", " target_property = Example.__dict__[value]\n", " my_examples = [example for example in train_examples if target_property(example) == False]\n", " for inv in invariants:\n", " sys.stdout.flush()\n", " inv_of_interest = invariants.index(inv)\n", " conjs = conjecture(my_examples, \n", " invariants, \n", " inv_of_interest, \n", " operators=use_operators, \n", " upperBound=True, \n", " time=Integer(5)\n", " # ,debug=True,\n", " # verbose=True,\n", " ,skips=my_skips\n", " )\n", " convert_conjecture_names(conjs)\n", " inv_conjectures += conjs\n", "\n", " conjs = conjecture(my_examples, \n", " invariants, \n", " inv_of_interest, \n", " operators=use_operators, \n", " upperBound=False, \n", " time=Integer(5)\n", " ,skips=my_skips)\n", " convert_conjecture_names(conjs)\n", " inv_conjectures += conjs\n", "else: # target is an invariant\n", " my_examples = [example for example in train_examples]\n", " conjs = conjecture(my_examples, \n", " invariants, \n", " target_invariant, \n", " operators=use_operators, \n", " upperBound=True, \n", " time=Integer(5)\n", " # ,debug=True,\n", " # verbose=True,\n", " ,skips=my_skips)\n", " convert_conjecture_names(conjs)\n", " inv_conjectures += conjs\n", " conjs = conjecture(my_examples, \n", " invariants, \n", " target_invariant, \n", " operators=use_operators,\n", " upperBound=False, \n", " time=Integer(5)\n", " ,skips=my_skips)\n", " convert_conjecture_names(conjs)\n", " inv_conjectures += conjs \n", "print(len(inv_conjectures)) \n", "\n", "for c in inv_conjectures:\n", " inv_file.write(\"%s\\n\" % c)\n", " inv_file.flush()\n", "inv_file.close()\n" ] }, { "cell_type": "markdown", "id": "2fc6d1d3-75e2-43e5-ad29-5e1669954216", "metadata": {}, "source": [ "Property conjecturing - sufficient conditions for a categorical target values. For a binary target, get sufficient conditions for the positive class and necessary conditions for the negative class." ] }, { "cell_type": "code", "execution_count": 17, "id": "9c8befb8", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "TARGET_1\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "/lustre/home/clarson/anaconda3/envs/sage/lib/python3.11/site-packages/sage/misc/functional.py:1209: RuntimeWarning: divide by zero encountered in log\n", " return ln(args[0], **kwds)\n", "/lustre/home/clarson/anaconda3/envs/sage/lib/python3.11/site-packages/sage/misc/functional.py:1209: RuntimeWarning: invalid value encountered in log\n", " return ln(args[0], **kwds)\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "(Sex_female)->(TARGET_1)\n", "(Pclass_2)->(TARGET_1)\n", "(cabin_letter_C)->(TARGET_1)\n", "(~(Fare_leq_10_to_the_power_e_to_the_power_open_bracket_e_to_the_power_SibSp_divided_by_10_to_the_power_Parch_close_bracket))->(TARGET_1)\n", "TARGET_1 Necessary\n", "(TARGET_1)->(Age_leq_2_divided_by_SibSp)\n", "(TARGET_1)->((Fare_leq_maximumopen_bracket_logopen_bracket_Age_close_bracket_or_logopen_bracket__minus_SibSp_close_bracket_close_bracket_squared)->(Sex_female))\n" ] } ], "source": [ "all_properties = [\"TARGET\"] + properties + inv_conjectures #\"TARGET\" is just a placeholder\n", "prop_conjs = []\n", "conditions = {}\n", "if \"TARGET\" in categorical_names:\n", " for value in target_property_names:\n", " print(value)\n", " all_properties[0] = Example.__dict__[value]\n", " #print(all_properties)\n", " these_prop_conjs = propertyBasedConjecture(objects=train_examples, # edit here 6/27/23\n", " properties = all_properties,\n", " mainProperty=0,\n", " #verbose=True,\n", " #debug=True,\n", " skips=my_skips)\n", " conditions[value] = []\n", " for c in these_prop_conjs: # edit here 6/27/23 just get premises once\n", " conditions[value].append(get_premise(c, myprint=False))\n", " prop_conjs += these_prop_conjs\n", " if len(target_property_names) == 1:\n", " print(value + \" Necessary\")\n", " all_properties[0] = Example.__dict__[value]\n", " these_prop_conjs = propertyBasedConjecture(objects=train_examples, # edit here 6/27/23\n", " properties = all_properties,\n", " mainProperty=0,\n", " sufficient=False,\n", " #verbose=True,\n", " # debug=True,\n", " skips=my_skips)\n", " conditions[\"necessary\"] = []\n", " for c in these_prop_conjs:\n", " conditions[\"necessary\"].append(get_conclusion(c, myprint=False))\n", " prop_conjs += these_prop_conjs # edit here 6/27/23\n", " \n", "for c in prop_conjs:\n", " prop_file.write(\"%s\\n\" % convert_name_back(c.__name__))\n", " prop_file.flush()\n", " \n", "prop_file.close()" ] }, { "cell_type": "markdown", "id": "9e3b80c5-fa02-4d58-bfe2-b44ff0b81666", "metadata": {}, "source": [ "Apply conjectures to train and test data if target is categorical." ] }, { "cell_type": "code", "execution_count": 18, "id": "f3ab0127-5bef-46ca-9f52-f0a27f35e55a", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "./conjecturing.py:279: RuntimeWarning: overflow encountered in scalar power\n", " return (lambda x: 10**x), 1\n", "/lustre/home/clarson/anaconda3/envs/sage/lib/python3.11/site-packages/sage/misc/functional.py:1209: RuntimeWarning: divide by zero encountered in log\n", " return ln(args[0], **kwds)\n", "/lustre/home/clarson/anaconda3/envs/sage/lib/python3.11/site-packages/sage/misc/functional.py:1209: RuntimeWarning: invalid value encountered in log\n", " return ln(args[0], **kwds)\n", "/lustre/home/clarson/anaconda3/envs/sage/lib/python3.11/site-packages/sage/misc/functional.py:1209: RuntimeWarning: invalid value encountered in log\n", " return ln(args[0], **kwds)\n", "/lustre/home/clarson/anaconda3/envs/sage/lib/python3.11/site-packages/sage/misc/functional.py:1209: RuntimeWarning: divide by zero encountered in log\n", " return ln(args[0], **kwds)\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "{'TARGET_1': [Sex_female, Pclass_2, cabin_letter_C, ~Fare_leq_10_to_the_power_e_to_the_power_open_bracket_e_to_the_power_SibSp_divided_by_10_to_the_power_Parch_close_bracket], 'necessary': [Age_leq_2_divided_by_SibSp, Fare_leq_maximumopen_bracket_logopen_bracket_Age_close_bracket_or_logopen_bracket__minus_SibSp_close_bracket_close_bracket_squared->Sex_female]}\n" ] } ], "source": [ "X_train_df = my_df.loc[X_train,property_names+invariant_names] # drop target and one level for each binary variable\n", "X_test_df = my_df.loc[X_test,property_names+invariant_names]\n", "y_train_df = my_data.loc[X_train,\"TARGET\"] # get original target, even if it is multiple levels\n", "y_test_df = my_data.loc[X_test, \"TARGET\"]\n", "if \"TARGET\" in categorical_names:\n", " index = 0\n", " for value in target_property_names:\n", " for condition in conditions[value]:\n", " index += 1\n", " X_train_df['conj_' + str(index)] = [condition(example) for example in train_examples]\n", " X_test_df['conj_' + str(index)] = [condition(example) for example in test_examples]\n", " if len(target_property_names) == 1:\n", " for condition in conditions[\"necessary\"]:\n", " index += 1\n", " X_train_df['conj_' + str(index)] = [condition(example) for example in train_examples]\n", " X_test_df['conj_' + str(index)] = [condition(example) for example in test_examples]\n", " \n", " X_train_df.head()\n", " #y_train_df.head() \n", " print(conditions)" ] }, { "cell_type": "code", "execution_count": 19, "id": "62030470-d62d-4555-80a1-c2b7094c4622", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "PassengerId\n", "861 0\n", "481 0\n", "321 0\n", "182 0\n", "565 0\n", " ..\n", "845 0\n", "89 1\n", "748 1\n", "33 1\n", "606 0\n", "Name: TARGET, Length: 881, dtype: object\n" ] } ], "source": [ "y_test_df\n", "target_property_names\n", "for value in target_property_names:\n", " this_value = value.replace(\"TARGET_\", \"\")\n", " print(y_test_df.astype('str'))" ] }, { "cell_type": "markdown", "id": "b410f6ff-3cff-41f6-aaab-7a60f6f6cbe3", "metadata": {}, "source": [ "Calculate support, precision, recall, lift, and F1. The F1 score is only for the class for the sufficient condition it was derived for." ] }, { "cell_type": "code", "execution_count": 21, "id": "f9e29f39-e7e2-46bb-8154-ec731a18109c", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "value: TARGET_1\n", "0 condition: Sex_female\n", "1 condition: Pclass_2\n", "3 condition: cabin_letter_C\n", "6 condition: ~Fare_leq_10_to_the_power_e_to_the_power_open_bracket_e_to_the_power_SibSp_divided_by_10_to_the_power_Parch_close_bracket\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "./conjecturing.py:279: RuntimeWarning: overflow encountered in scalar power\n", " return (lambda x: 10**x), 1\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "6 condition: Age_leq_2_divided_by_SibSp\n", "7 condition: Fare_leq_maximumopen_bracket_logopen_bracket_Age_close_bracket_or_logopen_bracket__minus_SibSp_close_bracket_close_bracket_squared->Sex_female\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "/lustre/home/clarson/anaconda3/envs/sage/lib/python3.11/site-packages/sage/misc/functional.py:1209: RuntimeWarning: invalid value encountered in log\n", " return ln(args[0], **kwds)\n", "/lustre/home/clarson/anaconda3/envs/sage/lib/python3.11/site-packages/sage/misc/functional.py:1209: RuntimeWarning: divide by zero encountered in log\n", " return ln(args[0], **kwds)\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
classsupportprecisionrecallliftf1
0TARGET_13130.7412140575079870.6863905325443791.931981019717560.712749615975422
1TARGET_11830.4699453551912570.2544378698224851.224916739418630.330134357005758
2TARGET_1580.5862068965517240.1005917159763311.527953478881860.171717171717172
3TARGET_11350.6000000000000000.2396449704142011.563905325443790.342494714587738
4necessary4060.6133004926108370.7366863905325440.9950602835914320.734513274336283
5necessary2940.8775510204081630.7633136094674561.423798248581200.761061946902655
\n", "
" ], "text/plain": [ " class support precision recall lift \\\n", "0 TARGET_1 313 0.741214057507987 0.686390532544379 1.93198101971756 \n", "1 TARGET_1 183 0.469945355191257 0.254437869822485 1.22491673941863 \n", "2 TARGET_1 58 0.586206896551724 0.100591715976331 1.52795347888186 \n", "3 TARGET_1 135 0.600000000000000 0.239644970414201 1.56390532544379 \n", "4 necessary 406 0.613300492610837 0.736686390532544 0.995060283591432 \n", "5 necessary 294 0.877551020408163 0.763313609467456 1.42379824858120 \n", "\n", " f1 \n", "0 0.712749615975422 \n", "1 0.330134357005758 \n", "2 0.171717171717172 \n", "3 0.342494714587738 \n", "4 0.734513274336283 \n", "5 0.761061946902655 " ] }, "execution_count": 21, "metadata": {}, "output_type": "execute_result" } ], "source": [ "support = []\n", "lift = []\n", "precision = []\n", "recall = []\n", "f1 = []\n", "classes = []\n", "count = 0\n", "if \"TARGET\" in categorical_names:\n", " for value in target_property_names:\n", " print(\"value: {}\".format(value))\n", " this_value = value.replace(\"TARGET_\", \"\")\n", " my_function = getattr(Example, value)\n", " for i, condition in enumerate(conditions[value]):\n", " count = count+i\n", " print(count, \"condition: {}\".format(condition))\n", " classes.append(value)\n", " num_true = 0\n", " num_in_class = 0\n", " num_hit = 0\n", " for example in test_examples:\n", " if condition(example) == True:\n", " num_true += 1\n", " if my_function(example) == True:\n", " num_hit += 1\n", " if my_function(example) == True:\n", " num_in_class += 1\n", " support.append(num_true)\n", " if num_hit > 0: \n", " precision.append(n(num_hit/num_true))\n", " lift.append(n(num_hit/num_true)/n(num_in_class/len(test_examples)))\n", " recall.append(n(num_hit/sum(y_test_df.astype('str') == this_value)))\n", " my_precision = n(num_hit/num_true)\n", " my_recall = n(num_hit/sum(y_test_df.astype('str') == this_value))\n", " f1.append((2*my_precision*my_recall)/(my_precision + my_recall))\n", " else:\n", " precision.append(0.0)\n", " lift.append(0.0)\n", " recall.append(0.0)\n", " f1.append(0.0)\n", " if len(target_property_names) == 1:\n", " for i, condition in enumerate(conditions[\"necessary\"]):\n", " count = count+i\n", " print(count, \"condition: {}\".format(condition))\n", " classes.append(\"necessary\")\n", " num_false = 0\n", " num_in_class = 0\n", " num_hit = 0\n", " for example in test_examples:\n", " if condition(example) == False:\n", " num_false += 1\n", " if my_function(example) == False:\n", " num_hit += 1\n", " if my_function(example) == False:\n", " num_in_class += 1\n", " support.append(num_false)\n", " if num_hit > 0: \n", " precision.append(n(num_hit/num_false))\n", " lift.append(n(num_hit/num_false)/n(num_in_class/len(test_examples)))\n", " recall.append(n(num_hit/(len(test_examples) - sum(y_test_df.astype('str') != this_value))))\n", " my_precision = n(num_hit/num_true)\n", " my_recall = n(num_hit/sum(y_test_df.astype('str') != this_value))\n", " f1.append((2*my_precision*my_recall)/(my_precision + my_recall))\n", " else:\n", " precision.append(0.0)\n", " lift.append(0.0)\n", " recall.append(0.0)\n", " f1.append(0.0)\n", " \n", "results_df = pd.DataFrame({\n", " 'class': classes,\n", " 'support':support, \n", " 'precision':precision, \n", " 'recall': recall, \n", " 'lift':lift, \n", " 'f1': f1})\n", " \n", "results_df" ] }, { "cell_type": "code", "execution_count": null, "id": "eae54efa-df74-4850-a64b-1aff582176ea", "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "SageMath 10.2", "language": "sage", "name": "sagemath" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.11.7" } }, "nbformat": 4, "nbformat_minor": 5 }