{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Processing data\n",
"\n",
"Many ML algorithms need data that is numeric, complete (no missing values) and standardized. Ensembles of trees are the most accommodating and require the least data processing."
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"# Location of the Titanic passenger spreadsheet (titanic3.xls).\n",
"url = 'http://biostat.mc.vanderbilt.edu/wiki/pub/Main/DataSets/titanic3.xls'"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"# Keep the freshly loaded frame untouched as df_orig; all processing happens on the copy.\n",
"df_orig = pd.read_excel(url)\n",
"df = df_orig.copy()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Basic inspection"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" pclass | \n",
" survived | \n",
" name | \n",
" sex | \n",
" age | \n",
" sibsp | \n",
" parch | \n",
" ticket | \n",
" fare | \n",
" cabin | \n",
" embarked | \n",
" boat | \n",
" body | \n",
" home.dest | \n",
"
\n",
" \n",
" \n",
" \n",
" | 1117 | \n",
" 3 | \n",
" 0 | \n",
" Pekoniemi, Mr. Edvard | \n",
" male | \n",
" 21.0 | \n",
" 0 | \n",
" 0 | \n",
" STON/O 2. 3101294 | \n",
" 7.9250 | \n",
" NaN | \n",
" S | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
"
\n",
" \n",
" | 412 | \n",
" 2 | \n",
" 0 | \n",
" Fynney, Mr. Joseph J | \n",
" male | \n",
" 35.0 | \n",
" 0 | \n",
" 0 | \n",
" 239865 | \n",
" 26.0000 | \n",
" NaN | \n",
" S | \n",
" NaN | \n",
" 322.0 | \n",
" Liverpool / Montreal, PQ | \n",
"
\n",
" \n",
" | 64 | \n",
" 1 | \n",
" 1 | \n",
" Chambers, Mr. Norman Campbell | \n",
" male | \n",
" 27.0 | \n",
" 1 | \n",
" 0 | \n",
" 113806 | \n",
" 53.1000 | \n",
" E8 | \n",
" S | \n",
" 5 | \n",
" NaN | \n",
" New York, NY / Ithaca, NY | \n",
"
\n",
" \n",
" | 650 | \n",
" 3 | \n",
" 0 | \n",
" Attalah, Miss. Malake | \n",
" female | \n",
" 17.0 | \n",
" 0 | \n",
" 0 | \n",
" 2627 | \n",
" 14.4583 | \n",
" NaN | \n",
" C | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
"
\n",
" \n",
" | 912 | \n",
" 3 | \n",
" 0 | \n",
" Karaic, Mr. Milan | \n",
" male | \n",
" 30.0 | \n",
" 0 | \n",
" 0 | \n",
" 349246 | \n",
" 7.8958 | \n",
" NaN | \n",
" S | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" pclass survived name sex age sibsp \\\n",
"1117 3 0 Pekoniemi, Mr. Edvard male 21.0 0 \n",
"412 2 0 Fynney, Mr. Joseph J male 35.0 0 \n",
"64 1 1 Chambers, Mr. Norman Campbell male 27.0 1 \n",
"650 3 0 Attalah, Miss. Malake female 17.0 0 \n",
"912 3 0 Karaic, Mr. Milan male 30.0 0 \n",
"\n",
" parch ticket fare cabin embarked boat body \\\n",
"1117 0 STON/O 2. 3101294 7.9250 NaN S NaN NaN \n",
"412 0 239865 26.0000 NaN S NaN 322.0 \n",
"64 0 113806 53.1000 E8 S 5 NaN \n",
"650 0 2627 14.4583 NaN C NaN NaN \n",
"912 0 349246 7.8958 NaN S NaN NaN \n",
"\n",
" home.dest \n",
"1117 NaN \n",
"412 Liverpool / Montreal, PQ \n",
"64 New York, NY / Ithaca, NY \n",
"650 NaN \n",
"912 NaN "
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Peek at five randomly chosen rows.\n",
"df.sample(n=5)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"RangeIndex: 1309 entries, 0 to 1308\n",
"Data columns (total 14 columns):\n",
" # Column Non-Null Count Dtype \n",
"--- ------ -------------- ----- \n",
" 0 pclass 1309 non-null int64 \n",
" 1 survived 1309 non-null int64 \n",
" 2 name 1309 non-null object \n",
" 3 sex 1309 non-null object \n",
" 4 age 1046 non-null float64\n",
" 5 sibsp 1309 non-null int64 \n",
" 6 parch 1309 non-null int64 \n",
" 7 ticket 1309 non-null object \n",
" 8 fare 1308 non-null float64\n",
" 9 cabin 295 non-null object \n",
" 10 embarked 1307 non-null object \n",
" 11 boat 486 non-null object \n",
" 12 body 121 non-null float64\n",
" 13 home.dest 745 non-null object \n",
"dtypes: float64(3), int64(4), object(7)\n",
"memory usage: 143.3+ KB\n"
]
}
],
"source": [
"df.info()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Detailed inspection"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"import pandas_profiling as pp"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {
"scrolled": false
},
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "f795bdbdb3664dd2b1c60d9a8419305c",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"HBox(children=(FloatProgress(value=0.0, description='Summarize dataset', max=28.0, style=ProgressStyle(descrip…"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "9be24b7caee241e4be22631c1f2c3141",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"HBox(children=(FloatProgress(value=0.0, description='Generate report structure', max=1.0, style=ProgressStyle(…"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "575dd299c8224a7c98f73af9644d9b09",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"HBox(children=(FloatProgress(value=0.0, description='Render HTML', max=1.0, style=ProgressStyle(description_wi…"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n"
]
},
{
"data": {
"text/html": [
""
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": []
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"pp.ProfileReport(df)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Create new features\n",
"\n",
"Sometimes we want to create new features from existing columns. For example, the `name` column can be mined to extract titles. We illustrate how to do this, but will not use the new feature here."
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Mr 757\n",
"Miss 260\n",
"Mrs 197\n",
"Master 61\n",
"Dr 8\n",
"Rev 8\n",
"Col 4\n",
"Major 2\n",
"Mlle 2\n",
"Ms 2\n",
"Don 1\n",
"Lady 1\n",
"Mme 1\n",
"Jonkheer 1\n",
"Sir 1\n",
"Capt 1\n",
"Dona 1\n",
"Countess 1\n",
"Name: title, dtype: int64"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Raw string: the backslash-dot must reach the regex engine as an escaped literal dot;\n",
"# in a normal string it is an invalid escape sequence that newer Pythons warn about.\n",
"# The leading .* is greedy, so the last match in the name is the one captured.\n",
"# expand=False returns a Series for the single capture group.\n",
"df['title'] = df.name.str.extract(r'.*([A-Z][a-z]+)\\..*', expand=False)\n",
"df.title.value_counts()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Drop features\n",
"\n",
"These features are either uninformative or leak information about the outcome."
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"# Set the outcome aside before removing it from the feature frame.\n",
"target = df['survived']\n",
"df = df.drop(\n",
"    columns=[\n",
"        'survived', 'name', 'ticket', 'cabin',\n",
"        'boat', 'body', 'home.dest', 'title',\n",
"    ]\n",
")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Inspect for missing data"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
"import missingno as mn"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"data": {
"image/png": "\n",
"text/plain": [
""
]
},
"metadata": {
"needs_background": "light"
},
"output_type": "display_data"
}
],
"source": [
"mn.matrix(df);"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Fill in missing values for categorical variables"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"sex 0\n",
"embarked 2\n",
"dtype: int64"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Count missing entries in each string-typed column.\n",
"df.select_dtypes(include='object').isna().sum()"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [],
"source": [
"# Missing ports become the empty string, i.e. a category of their own.\n",
"df['embarked'] = df.embarked.fillna(value='')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Tangent: `catboost` is nice\n",
"\n",
"Minimal processing or tuning is required to use `catboost`, making it a nice \"default\" algorithm."
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [],
"source": [
"# %pip installs into the environment of the running kernel, which a bare python3 on PATH may not be.\n",
"%pip install --quiet catboost"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [],
"source": [
"import catboost"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [],
"source": [
"from sklearn.model_selection import train_test_split"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [],
"source": [
"# Fixed seed so the same split is produced on every run.\n",
"X_train, X_test, y_train, y_test = train_test_split(\n",
"    df, target, random_state=0\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [],
"source": [
"cb = catboost.CatBoostClassifier()"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [],
"source": [
"# sex and embarked are still strings, so they are passed to catboost as categorical features.\n",
"cb.fit(\n",
"    X_train,\n",
"    y_train,\n",
"    cat_features=['sex', 'embarked'],\n",
"    verbose=0,\n",
");"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0.8170731707317073"
]
},
"execution_count": 20,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"cb.score(X_test, y_test)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Category encoding"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Vanilla encoding\n",
"\n",
"For variables with only a few distinct values, one hot encoding (or dummy variables) is often used. For variables with many distinct values, we can use hash encoding, which is basically the same idea but bins values using a hash function.\n",
"\n",
"We may choose to drop one of the created columns to avoid multicollinearity."
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {
"scrolled": true
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" pclass | \n",
" age | \n",
" sibsp | \n",
" parch | \n",
" fare | \n",
" sex_male | \n",
" embarked_C | \n",
" embarked_Q | \n",
" embarked_S | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" 1 | \n",
" 29.0000 | \n",
" 0 | \n",
" 0 | \n",
" 211.3375 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
"
\n",
" \n",
" | 1 | \n",
" 1 | \n",
" 0.9167 | \n",
" 1 | \n",
" 2 | \n",
" 151.5500 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
"
\n",
" \n",
" | 2 | \n",
" 1 | \n",
" 2.0000 | \n",
" 1 | \n",
" 2 | \n",
" 151.5500 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
"
\n",
" \n",
" | 3 | \n",
" 1 | \n",
" 30.0000 | \n",
" 1 | \n",
" 2 | \n",
" 151.5500 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
"
\n",
" \n",
" | 4 | \n",
" 1 | \n",
" 25.0000 | \n",
" 1 | \n",
" 2 | \n",
" 151.5500 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" pclass age sibsp parch fare sex_male embarked_C embarked_Q \\\n",
"0 1 29.0000 0 0 211.3375 0 0 0 \n",
"1 1 0.9167 1 2 151.5500 1 0 0 \n",
"2 1 2.0000 1 2 151.5500 0 0 0 \n",
"3 1 30.0000 1 2 151.5500 1 0 0 \n",
"4 1 25.0000 1 2 151.5500 0 0 0 \n",
"\n",
" embarked_S \n",
"0 1 \n",
"1 1 \n",
"2 1 \n",
"3 1 \n",
"4 1 "
]
},
"execution_count": 21,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# drop_first=True omits the first level of each categorical column.\n",
"pd.get_dummies(data=df, drop_first=True).head(n=5)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Target encoding\n",
"\n",
"We can use the target to find a more informative encoding. Note that these methods leak information and are prone to over-fitting."
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [],
"source": [
"import warnings\n",
"\n",
"# Silence FutureWarning messages for the rest of the session.\n",
"warnings.simplefilter(action='ignore', category=FutureWarning)"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {},
"outputs": [],
"source": [
"import category_encoders as ce"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {},
"outputs": [],
"source": [
"te = ce.TargetEncoder()"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" pclass | \n",
" age | \n",
" sibsp | \n",
" parch | \n",
" fare | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" 1 | \n",
" 29.0000 | \n",
" 0 | \n",
" 0 | \n",
" 211.3375 | \n",
"
\n",
" \n",
" | 1 | \n",
" 1 | \n",
" 0.9167 | \n",
" 1 | \n",
" 2 | \n",
" 151.5500 | \n",
"
\n",
" \n",
" | 2 | \n",
" 1 | \n",
" 2.0000 | \n",
" 1 | \n",
" 2 | \n",
" 151.5500 | \n",
"
\n",
" \n",
" | 3 | \n",
" 1 | \n",
" 30.0000 | \n",
" 1 | \n",
" 2 | \n",
" 151.5500 | \n",
"
\n",
" \n",
" | 4 | \n",
" 1 | \n",
" 25.0000 | \n",
" 1 | \n",
" 2 | \n",
" 151.5500 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" pclass age sibsp parch fare\n",
"0 1 29.0000 0 0 211.3375\n",
"1 1 0.9167 1 2 151.5500\n",
"2 1 2.0000 1 2 151.5500\n",
"3 1 30.0000 1 2 151.5500\n",
"4 1 25.0000 1 2 151.5500"
]
},
"execution_count": 25,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Target encoding replaces each category with a statistic of the target, so it has to be\n",
"# given the categorical (object) columns; numeric columns pass through unchanged.\n",
"te.fit_transform(df.select_dtypes('object'), target).head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Split data into train and test data sets\n",
"\n",
"Before we go further, we split into test and train data sets to avoid data leakage."
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {},
"outputs": [],
"source": [
"# Seed the shuffle so the split, and the CSV files saved from it, are the same on every run.\n",
"X_train, X_test, y_train, y_test = train_test_split(df, target, random_state=0)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Category encoding"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### We will be conservative and avoid risk of leakage\n",
"\n",
"Note we don't bother to drop columns - multicollinearity is only a problem when fitting linear models without regularization, which is rarely done in ML (cf. statistics)."
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {},
"outputs": [],
"source": [
"# use_cat_names=True names each indicator column after its category value (e.g. sex_male).\n",
"ohe = ce.OneHotEncoder(\n",
"    cols=['sex', 'embarked'],\n",
"    use_cat_names=True,\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {},
"outputs": [],
"source": [
"# Learn the category levels from the training split only, then encode both splits with them.\n",
"ohe.fit(X_train)\n",
"X_train = ohe.transform(X_train)\n",
"X_test = ohe.transform(X_test)"
]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" pclass | \n",
" sex_male | \n",
" sex_female | \n",
" age | \n",
" sibsp | \n",
" parch | \n",
" fare | \n",
" embarked_S | \n",
" embarked_C | \n",
" embarked_Q | \n",
" embarked_ | \n",
"
\n",
" \n",
" \n",
" \n",
" | 414 | \n",
" 2 | \n",
" 1 | \n",
" 0 | \n",
" 34.0 | \n",
" 1 | \n",
" 0 | \n",
" 21.0000 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
" | 287 | \n",
" 1 | \n",
" 1 | \n",
" 0 | \n",
" 61.0 | \n",
" 0 | \n",
" 0 | \n",
" 32.3208 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
" | 1279 | \n",
" 3 | \n",
" 0 | \n",
" 1 | \n",
" 14.0 | \n",
" 0 | \n",
" 0 | \n",
" 7.8542 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
" | 711 | \n",
" 3 | \n",
" 1 | \n",
" 0 | \n",
" 28.0 | \n",
" 0 | \n",
" 0 | \n",
" 7.2500 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
" | 358 | \n",
" 2 | \n",
" 0 | \n",
" 1 | \n",
" 42.0 | \n",
" 0 | \n",
" 0 | \n",
" 13.0000 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" pclass sex_male sex_female age sibsp parch fare embarked_S \\\n",
"414 2 1 0 34.0 1 0 21.0000 1 \n",
"287 1 1 0 61.0 0 0 32.3208 1 \n",
"1279 3 0 1 14.0 0 0 7.8542 1 \n",
"711 3 1 0 28.0 0 0 7.2500 1 \n",
"358 2 0 1 42.0 0 0 13.0000 1 \n",
"\n",
" embarked_C embarked_Q embarked_ \n",
"414 0 0 0 \n",
"287 0 0 0 \n",
"1279 0 0 0 \n",
"711 0 0 0 \n",
"358 0 0 0 "
]
},
"execution_count": 29,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"X_train.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Impute missing numeric values"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Vanilla imputation\n",
"\n",
"A simple imputation is to fill with mean or median."
]
},
{
"cell_type": "code",
"execution_count": 30,
"metadata": {},
"outputs": [],
"source": [
"from sklearn.impute import SimpleImputer"
]
},
{
"cell_type": "code",
"execution_count": 31,
"metadata": {},
"outputs": [],
"source": [
"si = SimpleImputer(strategy='mean')"
]
},
{
"cell_type": "code",
"execution_count": 32,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" pclass | \n",
" sex_male | \n",
" sex_female | \n",
" age | \n",
" sibsp | \n",
" parch | \n",
" fare | \n",
" embarked_S | \n",
" embarked_C | \n",
" embarked_Q | \n",
" embarked_ | \n",
"
\n",
" \n",
" \n",
" \n",
" | 414 | \n",
" 2 | \n",
" 1 | \n",
" 0 | \n",
" 34.0 | \n",
" 1 | \n",
" 0 | \n",
" 21.0000 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
" | 287 | \n",
" 1 | \n",
" 1 | \n",
" 0 | \n",
" 61.0 | \n",
" 0 | \n",
" 0 | \n",
" 32.3208 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
" | 1279 | \n",
" 3 | \n",
" 0 | \n",
" 1 | \n",
" 14.0 | \n",
" 0 | \n",
" 0 | \n",
" 7.8542 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" pclass sex_male sex_female age sibsp parch fare embarked_S \\\n",
"414 2 1 0 34.0 1 0 21.0000 1 \n",
"287 1 1 0 61.0 0 0 32.3208 1 \n",
"1279 3 0 1 14.0 0 0 7.8542 1 \n",
"\n",
" embarked_C embarked_Q embarked_ \n",
"414 0 0 0 \n",
"287 0 0 0 \n",
"1279 0 0 0 "
]
},
"execution_count": 32,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"X_train.select_dtypes('number').head(3)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"We illustrate the code but will try more fancy imputation instead.\n",
"\n",
"```python\n",
"X_train[X_train.select_dtypes('number').columns] = \\\n",
"si.fit_transform(X_train.select_dtypes('number'))\n",
"X_test[X_test.select_dtypes('number').columns] = \\\n",
"si.transform(X_test.select_dtypes('number'))\n",
"```"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Fancy imputation\n",
"\n",
"This basically does the same thing as `mice` in R."
]
},
{
"cell_type": "code",
"execution_count": 33,
"metadata": {},
"outputs": [],
"source": [
"from sklearn.experimental import enable_iterative_imputer\n",
"from sklearn.impute import IterativeImputer"
]
},
{
"cell_type": "code",
"execution_count": 34,
"metadata": {},
"outputs": [],
"source": [
"ii = IterativeImputer(random_state=0)"
]
},
{
"cell_type": "code",
"execution_count": 35,
"metadata": {},
"outputs": [],
"source": [
"# Fit the imputer on the training split and write the completed values back.\n",
"X_train[X_train.select_dtypes('number').columns] = (\n",
"    ii.fit_transform(X_train.select_dtypes('number'))\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 36,
"metadata": {},
"outputs": [],
"source": [
"# Reuse the imputer fitted on the training split; the test split is only transformed.\n",
"X_test[X_test.select_dtypes('number').columns] = (\n",
"    ii.transform(X_test.select_dtypes('number'))\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 37,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(0, 0)"
]
},
"execution_count": 37,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Total number of missing cells left in each split.\n",
"X_train.isna().sum().sum(), X_test.isna().sum().sum()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Simple example to illustrate differences"
]
},
{
"cell_type": "code",
"execution_count": 38,
"metadata": {},
"outputs": [],
"source": [
"import numpy as np"
]
},
{
"cell_type": "code",
"execution_count": 39,
"metadata": {},
"outputs": [],
"source": [
"# Toy matrix with missing entries, used to compare the two imputers.\n",
"x = np.array(\n",
"    [\n",
"        [10, 10],\n",
"        [1, 1],\n",
"        [2, 2],\n",
"        [10, 10],\n",
"        [10, np.nan],\n",
"        [np.nan, 10],\n",
"        [np.nan, np.nan],\n",
"    ]\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 40,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([[10. , 10. ],\n",
" [ 1. , 1. ],\n",
" [ 2. , 2. ],\n",
" [10. , 10. ],\n",
" [10. , 6.6],\n",
" [ 6.6, 10. ],\n",
" [ 6.6, 6.6]])"
]
},
"execution_count": 40,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"si.fit_transform(x)"
]
},
{
"cell_type": "code",
"execution_count": 41,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([[10. , 10. ],\n",
" [ 1. , 1. ],\n",
" [ 2. , 2. ],\n",
" [10. , 10. ],\n",
" [10. , 9.99968523],\n",
" [10.00094638, 10. ],\n",
" [ 7.1668244 , 7.1666142 ]])"
]
},
"execution_count": 41,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"ii.fit_transform(x)"
]
},
{
"cell_type": "code",
"execution_count": 42,
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"\n",
"# to_csv does not create missing parent directories, so make sure data/ exists first.\n",
"os.makedirs('data', exist_ok=True)\n",
"X_train.to_csv('data/X_train_unscaled.csv', index=False)\n",
"X_test.to_csv('data/X_test_unscaled.csv', index=False)\n",
"y_train.to_csv('data/y_train_unscaled.csv', index=False)\n",
"y_test.to_csv('data/y_test_unscaled.csv', index=False)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Standardize data"
]
},
{
"cell_type": "code",
"execution_count": 43,
"metadata": {},
"outputs": [],
"source": [
"from sklearn.preprocessing import StandardScaler"
]
},
{
"cell_type": "code",
"execution_count": 44,
"metadata": {},
"outputs": [],
"source": [
"scaler = StandardScaler()"
]
},
{
"cell_type": "code",
"execution_count": 45,
"metadata": {},
"outputs": [],
"source": [
"# Learn the scaling parameters from the training split only, then apply them to both splits.\n",
"scaler.fit(X_train)\n",
"X_train.iloc[:, :] = scaler.transform(X_train)\n",
"X_test.iloc[:, :] = scaler.transform(X_test)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Save processed data for future use"
]
},
{
"cell_type": "code",
"execution_count": 46,
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"\n",
"# to_csv does not create missing parent directories, so make sure data/ exists first.\n",
"os.makedirs('data', exist_ok=True)\n",
"X_train.to_csv('data/X_train.csv', index=False)\n",
"X_test.to_csv('data/X_test.csv', index=False)\n",
"y_train.to_csv('data/y_train.csv', index=False)\n",
"y_test.to_csv('data/y_test.csv', index=False)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3.8.5 64-bit",
"language": "python",
"name": "python38564bit02a66c47ce504b05b2ef5646cfed96c2"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.5"
}
},
"nbformat": 4,
"nbformat_minor": 4
}