{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Machine Learning project \n", "\n", "This notebook is a short version of the [end-to-end Machine Learning project](https://github.com/ageron/handson-ml3/blob/main/02_end_to_end_machine_learning_project.ipynb) provided by Aurélien Géron." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Setup" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "This project requires: \n", "\n", "- Python 3.7 or above\n", "- Scikit-Learn ≥ 1.0.1:" ] }, { "cell_type": "code", "execution_count": 95, "metadata": {}, "outputs": [], "source": [ "%matplotlib inline\n", "\n", "import re\n", "import sys\n", "from pathlib import Path\n", "\n", "import numpy as np\n", "import pandas as pd\n", "from pandas.plotting import scatter_matrix\n", "import matplotlib.pyplot as plt\n", "\n", "import sklearn\n", "from sklearn import set_config\n", "set_config(display='diagram')\n", "\n", "plt.rc('font', size=14)\n", "plt.rc('axes', labelsize=14, titlesize=14)\n", "plt.rc('legend', fontsize=14)\n", "plt.rc('xtick', labelsize=10)\n", "plt.rc('ytick', labelsize=10)\n", "\n", "# Check if you have the correct versions.\n", "# Compare numeric components: comparing version strings directly is\n", "# lexicographic (e.g. \"1.10\" < \"1.2\" as strings), not numeric.\n", "sklearn_version = tuple(int(part) for part in re.findall(r\"\\d+\", sklearn.__version__)[:3])\n", "assert sklearn_version >= (1, 0, 1)\n", "assert sys.version_info >= (3, 7)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Data" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "*Welcome to Machine Learning Housing Corp.! Your task is to predict median house values in Californian districts, given a number of features from these districts.*" ] }, { "cell_type": "code", "execution_count": 96, "metadata": {}, "outputs": [], "source": [ "housing = pd.read_csv(\"https://raw.githubusercontent.com/kirenz/datasets/master/housing_hml3.csv\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Overview" ] }, { "cell_type": "code", "execution_count": 97, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", " | longitude | \n", "latitude | \n", "housing_median_age | \n", "total_rooms | \n", "total_bedrooms | \n", "population | \n", "households | \n", "median_income | \n", "median_house_value | \n", "ocean_proximity | \n", "
---|---|---|---|---|---|---|---|---|---|---|
0 | \n", "-122.23 | \n", "37.88 | \n", "41.0 | \n", "880.0 | \n", "129.0 | \n", "322.0 | \n", "126.0 | \n", "8.3252 | \n", "452600.0 | \n", "NEAR BAY | \n", "
1 | \n", "-122.22 | \n", "37.86 | \n", "21.0 | \n", "7099.0 | \n", "1106.0 | \n", "2401.0 | \n", "1138.0 | \n", "8.3014 | \n", "358500.0 | \n", "NEAR BAY | \n", "
2 | \n", "-122.24 | \n", "37.85 | \n", "52.0 | \n", "1467.0 | \n", "190.0 | \n", "496.0 | \n", "177.0 | \n", "7.2574 | \n", "352100.0 | \n", "NEAR BAY | \n", "
3 | \n", "-122.25 | \n", "37.85 | \n", "52.0 | \n", "1274.0 | \n", "235.0 | \n", "558.0 | \n", "219.0 | \n", "5.6431 | \n", "341300.0 | \n", "NEAR BAY | \n", "
4 | \n", "-122.25 | \n", "37.85 | \n", "52.0 | \n", "1627.0 | \n", "280.0 | \n", "565.0 | \n", "259.0 | \n", "3.8462 | \n", "342200.0 | \n", "NEAR BAY | \n", "
\n", " | count | \n", "mean | \n", "std | \n", "min | \n", "25% | \n", "50% | \n", "75% | \n", "max | \n", "
---|---|---|---|---|---|---|---|---|
longitude | \n", "20640.0 | \n", "-119.569704 | \n", "2.003532 | \n", "-124.3500 | \n", "-121.8000 | \n", "-118.4900 | \n", "-118.01000 | \n", "-114.3100 | \n", "
latitude | \n", "20640.0 | \n", "35.631861 | \n", "2.135952 | \n", "32.5400 | \n", "33.9300 | \n", "34.2600 | \n", "37.71000 | \n", "41.9500 | \n", "
housing_median_age | \n", "20640.0 | \n", "28.639486 | \n", "12.585558 | \n", "1.0000 | \n", "18.0000 | \n", "29.0000 | \n", "37.00000 | \n", "52.0000 | \n", "
total_rooms | \n", "20640.0 | \n", "2635.763081 | \n", "2181.615252 | \n", "2.0000 | \n", "1447.7500 | \n", "2127.0000 | \n", "3148.00000 | \n", "39320.0000 | \n", "
total_bedrooms | \n", "20433.0 | \n", "537.870553 | \n", "421.385070 | \n", "1.0000 | \n", "296.0000 | \n", "435.0000 | \n", "647.00000 | \n", "6445.0000 | \n", "
population | \n", "20640.0 | \n", "1425.476744 | \n", "1132.462122 | \n", "3.0000 | \n", "787.0000 | \n", "1166.0000 | \n", "1725.00000 | \n", "35682.0000 | \n", "
households | \n", "20640.0 | \n", "499.539680 | \n", "382.329753 | \n", "1.0000 | \n", "280.0000 | \n", "409.0000 | \n", "605.00000 | \n", "6082.0000 | \n", "
median_income | \n", "20640.0 | \n", "3.870671 | \n", "1.899822 | \n", "0.4999 | \n", "2.5634 | \n", "3.5348 | \n", "4.74325 | \n", "15.0001 | \n", "
median_house_value | \n", "20640.0 | \n", "206855.816909 | \n", "115395.615874 | \n", "14999.0000 | \n", "119600.0000 | \n", "179700.0000 | \n", "264725.00000 | \n", "500001.0000 | \n", "
Pipeline(steps=[('columntransformer',\n", " ColumnTransformer(remainder=Pipeline(steps=[('simpleimputer',\n", " SimpleImputer(strategy='median')),\n", " ('standardscaler',\n", " StandardScaler())]),\n", " transformers=[('bedrooms_ratio',\n", " Pipeline(steps=[('simpleimputer',\n", " SimpleImputer(strategy='median')),\n", " ('functiontransformer',\n", " FunctionTransformer(func=<function column_ratio at 0x186eac9d...\n", " 'households',\n", " 'median_income']),\n", " ('geo',\n", " ClusterSimilarity(random_state=42),\n", " ['latitude', 'longitude']),\n", " ('cat',\n", " Pipeline(steps=[('simpleimputer',\n", " SimpleImputer(strategy='most_frequent')),\n", " ('onehotencoder',\n", " OneHotEncoder(handle_unknown='ignore'))]),\n", " <sklearn.compose._column_transformer.make_column_selector object at 0x186e9fd30>)])),\n", " ('linearregression', LinearRegression())])In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
Pipeline(steps=[('columntransformer',\n", " ColumnTransformer(remainder=Pipeline(steps=[('simpleimputer',\n", " SimpleImputer(strategy='median')),\n", " ('standardscaler',\n", " StandardScaler())]),\n", " transformers=[('bedrooms_ratio',\n", " Pipeline(steps=[('simpleimputer',\n", " SimpleImputer(strategy='median')),\n", " ('functiontransformer',\n", " FunctionTransformer(func=<function column_ratio at 0x186eac9d...\n", " 'households',\n", " 'median_income']),\n", " ('geo',\n", " ClusterSimilarity(random_state=42),\n", " ['latitude', 'longitude']),\n", " ('cat',\n", " Pipeline(steps=[('simpleimputer',\n", " SimpleImputer(strategy='most_frequent')),\n", " ('onehotencoder',\n", " OneHotEncoder(handle_unknown='ignore'))]),\n", " <sklearn.compose._column_transformer.make_column_selector object at 0x186e9fd30>)])),\n", " ('linearregression', LinearRegression())])
ColumnTransformer(remainder=Pipeline(steps=[('simpleimputer',\n", " SimpleImputer(strategy='median')),\n", " ('standardscaler',\n", " StandardScaler())]),\n", " transformers=[('bedrooms_ratio',\n", " Pipeline(steps=[('simpleimputer',\n", " SimpleImputer(strategy='median')),\n", " ('functiontransformer',\n", " FunctionTransformer(func=<function column_ratio at 0x186eac9d0>)),\n", " ('standardscaler',\n", " StandardScaler...\n", " ['total_bedrooms', 'total_rooms', 'population',\n", " 'households', 'median_income']),\n", " ('geo', ClusterSimilarity(random_state=42),\n", " ['latitude', 'longitude']),\n", " ('cat',\n", " Pipeline(steps=[('simpleimputer',\n", " SimpleImputer(strategy='most_frequent')),\n", " ('onehotencoder',\n", " OneHotEncoder(handle_unknown='ignore'))]),\n", " <sklearn.compose._column_transformer.make_column_selector object at 0x186e9fd30>)])
['total_bedrooms', 'total_rooms']
SimpleImputer(strategy='median')
FunctionTransformer(func=<function column_ratio at 0x186eac9d0>)
StandardScaler()
['total_rooms', 'households']
SimpleImputer(strategy='median')
FunctionTransformer(func=<function column_ratio at 0x186eac9d0>)
StandardScaler()
['population', 'households']
SimpleImputer(strategy='median')
FunctionTransformer(func=<function column_ratio at 0x186eac9d0>)
StandardScaler()
['total_bedrooms', 'total_rooms', 'population', 'households', 'median_income']
SimpleImputer(strategy='median')
FunctionTransformer(func=<ufunc 'log'>)
StandardScaler()
['latitude', 'longitude']
ClusterSimilarity(random_state=42)
<sklearn.compose._column_transformer.make_column_selector object at 0x186e9fd30>
SimpleImputer(strategy='most_frequent')
OneHotEncoder(handle_unknown='ignore')
['housing_median_age']
SimpleImputer(strategy='median')
StandardScaler()
LinearRegression()
Pipeline(steps=[('columntransformer',\n", " ColumnTransformer(remainder=Pipeline(steps=[('simpleimputer',\n", " SimpleImputer(strategy='median')),\n", " ('standardscaler',\n", " StandardScaler())]),\n", " transformers=[('bedrooms_ratio',\n", " Pipeline(steps=[('simpleimputer',\n", " SimpleImputer(strategy='median')),\n", " ('functiontransformer',\n", " FunctionTransformer(func=<function column_ratio at 0x186eac9d...\n", " ('geo',\n", " ClusterSimilarity(random_state=42),\n", " ['latitude', 'longitude']),\n", " ('cat',\n", " Pipeline(steps=[('simpleimputer',\n", " SimpleImputer(strategy='most_frequent')),\n", " ('onehotencoder',\n", " OneHotEncoder(handle_unknown='ignore'))]),\n", " <sklearn.compose._column_transformer.make_column_selector object at 0x186e9fd30>)])),\n", " ('decisiontreeregressor',\n", " DecisionTreeRegressor(random_state=42))])In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
Pipeline(steps=[('columntransformer',\n", " ColumnTransformer(remainder=Pipeline(steps=[('simpleimputer',\n", " SimpleImputer(strategy='median')),\n", " ('standardscaler',\n", " StandardScaler())]),\n", " transformers=[('bedrooms_ratio',\n", " Pipeline(steps=[('simpleimputer',\n", " SimpleImputer(strategy='median')),\n", " ('functiontransformer',\n", " FunctionTransformer(func=<function column_ratio at 0x186eac9d...\n", " ('geo',\n", " ClusterSimilarity(random_state=42),\n", " ['latitude', 'longitude']),\n", " ('cat',\n", " Pipeline(steps=[('simpleimputer',\n", " SimpleImputer(strategy='most_frequent')),\n", " ('onehotencoder',\n", " OneHotEncoder(handle_unknown='ignore'))]),\n", " <sklearn.compose._column_transformer.make_column_selector object at 0x186e9fd30>)])),\n", " ('decisiontreeregressor',\n", " DecisionTreeRegressor(random_state=42))])
ColumnTransformer(remainder=Pipeline(steps=[('simpleimputer',\n", " SimpleImputer(strategy='median')),\n", " ('standardscaler',\n", " StandardScaler())]),\n", " transformers=[('bedrooms_ratio',\n", " Pipeline(steps=[('simpleimputer',\n", " SimpleImputer(strategy='median')),\n", " ('functiontransformer',\n", " FunctionTransformer(func=<function column_ratio at 0x186eac9d0>)),\n", " ('standardscaler',\n", " StandardScaler...\n", " ['total_bedrooms', 'total_rooms', 'population',\n", " 'households', 'median_income']),\n", " ('geo', ClusterSimilarity(random_state=42),\n", " ['latitude', 'longitude']),\n", " ('cat',\n", " Pipeline(steps=[('simpleimputer',\n", " SimpleImputer(strategy='most_frequent')),\n", " ('onehotencoder',\n", " OneHotEncoder(handle_unknown='ignore'))]),\n", " <sklearn.compose._column_transformer.make_column_selector object at 0x186e9fd30>)])
['total_bedrooms', 'total_rooms']
SimpleImputer(strategy='median')
FunctionTransformer(func=<function column_ratio at 0x186eac9d0>)
StandardScaler()
['total_rooms', 'households']
SimpleImputer(strategy='median')
FunctionTransformer(func=<function column_ratio at 0x186eac9d0>)
StandardScaler()
['population', 'households']
SimpleImputer(strategy='median')
FunctionTransformer(func=<function column_ratio at 0x186eac9d0>)
StandardScaler()
['total_bedrooms', 'total_rooms', 'population', 'households', 'median_income']
SimpleImputer(strategy='median')
FunctionTransformer(func=<ufunc 'log'>)
StandardScaler()
['latitude', 'longitude']
ClusterSimilarity(random_state=42)
<sklearn.compose._column_transformer.make_column_selector object at 0x186e9fd30>
SimpleImputer(strategy='most_frequent')
OneHotEncoder(handle_unknown='ignore')
['housing_median_age']
SimpleImputer(strategy='median')
StandardScaler()
DecisionTreeRegressor(random_state=42)
GridSearchCV(cv=3,\n", " estimator=Pipeline(steps=[('preprocessing',\n", " ColumnTransformer(remainder=Pipeline(steps=[('simpleimputer',\n", " SimpleImputer(strategy='median')),\n", " ('standardscaler',\n", " StandardScaler())]),\n", " transformers=[('bedrooms_ratio',\n", " Pipeline(steps=[('simpleimputer',\n", " SimpleImputer(strategy='median')),\n", " ('functiontransformer',\n", " FunctionTransformer(func=<function...\n", " <sklearn.compose._column_transformer.make_column_selector object at 0x186e9fd30>)])),\n", " ('random_forest',\n", " RandomForestRegressor(random_state=42))]),\n", " param_grid=[{'preprocessing__geo__n_clusters': [5, 8, 10],\n", " 'random_forest__max_features': [4, 6, 8]},\n", " {'preprocessing__geo__n_clusters': [10, 15],\n", " 'random_forest__max_features': [6, 8, 10]}],\n", " scoring='neg_root_mean_squared_error')In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
GridSearchCV(cv=3,\n", " estimator=Pipeline(steps=[('preprocessing',\n", " ColumnTransformer(remainder=Pipeline(steps=[('simpleimputer',\n", " SimpleImputer(strategy='median')),\n", " ('standardscaler',\n", " StandardScaler())]),\n", " transformers=[('bedrooms_ratio',\n", " Pipeline(steps=[('simpleimputer',\n", " SimpleImputer(strategy='median')),\n", " ('functiontransformer',\n", " FunctionTransformer(func=<function...\n", " <sklearn.compose._column_transformer.make_column_selector object at 0x186e9fd30>)])),\n", " ('random_forest',\n", " RandomForestRegressor(random_state=42))]),\n", " param_grid=[{'preprocessing__geo__n_clusters': [5, 8, 10],\n", " 'random_forest__max_features': [4, 6, 8]},\n", " {'preprocessing__geo__n_clusters': [10, 15],\n", " 'random_forest__max_features': [6, 8, 10]}],\n", " scoring='neg_root_mean_squared_error')
Pipeline(steps=[('preprocessing',\n", " ColumnTransformer(remainder=Pipeline(steps=[('simpleimputer',\n", " SimpleImputer(strategy='median')),\n", " ('standardscaler',\n", " StandardScaler())]),\n", " transformers=[('bedrooms_ratio',\n", " Pipeline(steps=[('simpleimputer',\n", " SimpleImputer(strategy='median')),\n", " ('functiontransformer',\n", " FunctionTransformer(func=<function column_ratio at 0x186eac9d0>))...\n", " 'median_income']),\n", " ('geo',\n", " ClusterSimilarity(random_state=42),\n", " ['latitude', 'longitude']),\n", " ('cat',\n", " Pipeline(steps=[('simpleimputer',\n", " SimpleImputer(strategy='most_frequent')),\n", " ('onehotencoder',\n", " OneHotEncoder(handle_unknown='ignore'))]),\n", " <sklearn.compose._column_transformer.make_column_selector object at 0x186e9fd30>)])),\n", " ('random_forest', RandomForestRegressor(random_state=42))])
ColumnTransformer(remainder=Pipeline(steps=[('simpleimputer',\n", " SimpleImputer(strategy='median')),\n", " ('standardscaler',\n", " StandardScaler())]),\n", " transformers=[('bedrooms_ratio',\n", " Pipeline(steps=[('simpleimputer',\n", " SimpleImputer(strategy='median')),\n", " ('functiontransformer',\n", " FunctionTransformer(func=<function column_ratio at 0x186eac9d0>)),\n", " ('standardscaler',\n", " StandardScaler...\n", " ['total_bedrooms', 'total_rooms', 'population',\n", " 'households', 'median_income']),\n", " ('geo', ClusterSimilarity(random_state=42),\n", " ['latitude', 'longitude']),\n", " ('cat',\n", " Pipeline(steps=[('simpleimputer',\n", " SimpleImputer(strategy='most_frequent')),\n", " ('onehotencoder',\n", " OneHotEncoder(handle_unknown='ignore'))]),\n", " <sklearn.compose._column_transformer.make_column_selector object at 0x186e9fd30>)])
['total_bedrooms', 'total_rooms']
SimpleImputer(strategy='median')
FunctionTransformer(func=<function column_ratio at 0x186eac9d0>)
StandardScaler()
['total_rooms', 'households']
SimpleImputer(strategy='median')
FunctionTransformer(func=<function column_ratio at 0x186eac9d0>)
StandardScaler()
['population', 'households']
SimpleImputer(strategy='median')
FunctionTransformer(func=<function column_ratio at 0x186eac9d0>)
StandardScaler()
['total_bedrooms', 'total_rooms', 'population', 'households', 'median_income']
SimpleImputer(strategy='median')
FunctionTransformer(func=<ufunc 'log'>)
StandardScaler()
['latitude', 'longitude']
ClusterSimilarity(random_state=42)
<sklearn.compose._column_transformer.make_column_selector object at 0x186e9fd30>
SimpleImputer(strategy='most_frequent')
OneHotEncoder(handle_unknown='ignore')
['housing_median_age']
SimpleImputer(strategy='median')
StandardScaler()
RandomForestRegressor(random_state=42)
Pipeline(steps=[('preprocessing',\n", " ColumnTransformer(remainder=Pipeline(steps=[('simpleimputer',\n", " SimpleImputer(strategy='median')),\n", " ('standardscaler',\n", " StandardScaler())]),\n", " transformers=[('bedrooms_ratio',\n", " Pipeline(steps=[('simpleimputer',\n", " SimpleImputer(strategy='median')),\n", " ('functiontransformer',\n", " FunctionTransformer(func=<function column_ratio at 0x186eac9d0>))...\n", " ClusterSimilarity(n_clusters=15,\n", " random_state=42),\n", " ['latitude', 'longitude']),\n", " ('cat',\n", " Pipeline(steps=[('simpleimputer',\n", " SimpleImputer(strategy='most_frequent')),\n", " ('onehotencoder',\n", " OneHotEncoder(handle_unknown='ignore'))]),\n", " <sklearn.compose._column_transformer.make_column_selector object at 0x186ef0250>)])),\n", " ('random_forest',\n", " RandomForestRegressor(max_features=6, random_state=42))])In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
Pipeline(steps=[('preprocessing',\n", " ColumnTransformer(remainder=Pipeline(steps=[('simpleimputer',\n", " SimpleImputer(strategy='median')),\n", " ('standardscaler',\n", " StandardScaler())]),\n", " transformers=[('bedrooms_ratio',\n", " Pipeline(steps=[('simpleimputer',\n", " SimpleImputer(strategy='median')),\n", " ('functiontransformer',\n", " FunctionTransformer(func=<function column_ratio at 0x186eac9d0>))...\n", " ClusterSimilarity(n_clusters=15,\n", " random_state=42),\n", " ['latitude', 'longitude']),\n", " ('cat',\n", " Pipeline(steps=[('simpleimputer',\n", " SimpleImputer(strategy='most_frequent')),\n", " ('onehotencoder',\n", " OneHotEncoder(handle_unknown='ignore'))]),\n", " <sklearn.compose._column_transformer.make_column_selector object at 0x186ef0250>)])),\n", " ('random_forest',\n", " RandomForestRegressor(max_features=6, random_state=42))])
ColumnTransformer(remainder=Pipeline(steps=[('simpleimputer',\n", " SimpleImputer(strategy='median')),\n", " ('standardscaler',\n", " StandardScaler())]),\n", " transformers=[('bedrooms_ratio',\n", " Pipeline(steps=[('simpleimputer',\n", " SimpleImputer(strategy='median')),\n", " ('functiontransformer',\n", " FunctionTransformer(func=<function column_ratio at 0x186eac9d0>)),\n", " ('standardscaler',\n", " StandardScaler...\n", " ['total_bedrooms', 'total_rooms', 'population',\n", " 'households', 'median_income']),\n", " ('geo',\n", " ClusterSimilarity(n_clusters=15,\n", " random_state=42),\n", " ['latitude', 'longitude']),\n", " ('cat',\n", " Pipeline(steps=[('simpleimputer',\n", " SimpleImputer(strategy='most_frequent')),\n", " ('onehotencoder',\n", " OneHotEncoder(handle_unknown='ignore'))]),\n", " <sklearn.compose._column_transformer.make_column_selector object at 0x186ef0250>)])
['total_bedrooms', 'total_rooms']
SimpleImputer(strategy='median')
FunctionTransformer(func=<function column_ratio at 0x186eac9d0>)
StandardScaler()
['total_rooms', 'households']
SimpleImputer(strategy='median')
FunctionTransformer(func=<function column_ratio at 0x186eac9d0>)
StandardScaler()
['population', 'households']
SimpleImputer(strategy='median')
FunctionTransformer(func=<function column_ratio at 0x186eac9d0>)
StandardScaler()
['total_bedrooms', 'total_rooms', 'population', 'households', 'median_income']
SimpleImputer(strategy='median')
FunctionTransformer(func=<ufunc 'log'>)
StandardScaler()
['latitude', 'longitude']
ClusterSimilarity(n_clusters=15, random_state=42)
<sklearn.compose._column_transformer.make_column_selector object at 0x186ef0250>
SimpleImputer(strategy='most_frequent')
OneHotEncoder(handle_unknown='ignore')
['housing_median_age']
SimpleImputer(strategy='median')
StandardScaler()
RandomForestRegressor(max_features=6, random_state=42)
\n", " | n_clusters | \n", "max_features | \n", "split0 | \n", "split1 | \n", "split2 | \n", "mean_test_rmse | \n", "
---|---|---|---|---|---|---|
12 | \n", "15 | \n", "6 | \n", "43460 | \n", "43919 | \n", "44748 | \n", "44042 | \n", "
13 | \n", "15 | \n", "8 | \n", "44132 | \n", "44075 | \n", "45010 | \n", "44406 | \n", "
14 | \n", "15 | \n", "10 | \n", "44374 | \n", "44286 | \n", "45316 | \n", "44659 | \n", "
7 | \n", "10 | \n", "6 | \n", "44683 | \n", "44655 | \n", "45657 | \n", "44999 | \n", "
9 | \n", "10 | \n", "6 | \n", "44683 | \n", "44655 | \n", "45657 | \n", "44999 | \n", "
RandomizedSearchCV(cv=3,\n", " estimator=Pipeline(steps=[('preprocessing',\n", " ColumnTransformer(remainder=Pipeline(steps=[('simpleimputer',\n", " SimpleImputer(strategy='median')),\n", " ('standardscaler',\n", " StandardScaler())]),\n", " transformers=[('bedrooms_ratio',\n", " Pipeline(steps=[('simpleimputer',\n", " SimpleImputer(strategy='median')),\n", " ('functiontransformer',\n", " FunctionTransformer(func=<fu...\n", " <sklearn.compose._column_transformer.make_column_selector object at 0x186e9fd30>)])),\n", " ('random_forest',\n", " RandomForestRegressor(random_state=42))]),\n", " param_distributions={'preprocessing__geo__n_clusters': <scipy.stats._distn_infrastructure.rv_discrete_frozen object at 0x186f11a00>,\n", " 'random_forest__max_features': <scipy.stats._distn_infrastructure.rv_discrete_frozen object at 0x186ec66a0>},\n", " random_state=42, scoring='neg_root_mean_squared_error')In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
RandomizedSearchCV(cv=3,\n", " estimator=Pipeline(steps=[('preprocessing',\n", " ColumnTransformer(remainder=Pipeline(steps=[('simpleimputer',\n", " SimpleImputer(strategy='median')),\n", " ('standardscaler',\n", " StandardScaler())]),\n", " transformers=[('bedrooms_ratio',\n", " Pipeline(steps=[('simpleimputer',\n", " SimpleImputer(strategy='median')),\n", " ('functiontransformer',\n", " FunctionTransformer(func=<fu...\n", " <sklearn.compose._column_transformer.make_column_selector object at 0x186e9fd30>)])),\n", " ('random_forest',\n", " RandomForestRegressor(random_state=42))]),\n", " param_distributions={'preprocessing__geo__n_clusters': <scipy.stats._distn_infrastructure.rv_discrete_frozen object at 0x186f11a00>,\n", " 'random_forest__max_features': <scipy.stats._distn_infrastructure.rv_discrete_frozen object at 0x186ec66a0>},\n", " random_state=42, scoring='neg_root_mean_squared_error')
Pipeline(steps=[('preprocessing',\n", " ColumnTransformer(remainder=Pipeline(steps=[('simpleimputer',\n", " SimpleImputer(strategy='median')),\n", " ('standardscaler',\n", " StandardScaler())]),\n", " transformers=[('bedrooms_ratio',\n", " Pipeline(steps=[('simpleimputer',\n", " SimpleImputer(strategy='median')),\n", " ('functiontransformer',\n", " FunctionTransformer(func=<function column_ratio at 0x186eac9d0>))...\n", " 'median_income']),\n", " ('geo',\n", " ClusterSimilarity(random_state=42),\n", " ['latitude', 'longitude']),\n", " ('cat',\n", " Pipeline(steps=[('simpleimputer',\n", " SimpleImputer(strategy='most_frequent')),\n", " ('onehotencoder',\n", " OneHotEncoder(handle_unknown='ignore'))]),\n", " <sklearn.compose._column_transformer.make_column_selector object at 0x186e9fd30>)])),\n", " ('random_forest', RandomForestRegressor(random_state=42))])
ColumnTransformer(remainder=Pipeline(steps=[('simpleimputer',\n", " SimpleImputer(strategy='median')),\n", " ('standardscaler',\n", " StandardScaler())]),\n", " transformers=[('bedrooms_ratio',\n", " Pipeline(steps=[('simpleimputer',\n", " SimpleImputer(strategy='median')),\n", " ('functiontransformer',\n", " FunctionTransformer(func=<function column_ratio at 0x186eac9d0>)),\n", " ('standardscaler',\n", " StandardScaler...\n", " ['total_bedrooms', 'total_rooms', 'population',\n", " 'households', 'median_income']),\n", " ('geo', ClusterSimilarity(random_state=42),\n", " ['latitude', 'longitude']),\n", " ('cat',\n", " Pipeline(steps=[('simpleimputer',\n", " SimpleImputer(strategy='most_frequent')),\n", " ('onehotencoder',\n", " OneHotEncoder(handle_unknown='ignore'))]),\n", " <sklearn.compose._column_transformer.make_column_selector object at 0x186e9fd30>)])
['total_bedrooms', 'total_rooms']
SimpleImputer(strategy='median')
FunctionTransformer(func=<function column_ratio at 0x186eac9d0>)
StandardScaler()
['total_rooms', 'households']
SimpleImputer(strategy='median')
FunctionTransformer(func=<function column_ratio at 0x186eac9d0>)
StandardScaler()
['population', 'households']
SimpleImputer(strategy='median')
FunctionTransformer(func=<function column_ratio at 0x186eac9d0>)
StandardScaler()
['total_bedrooms', 'total_rooms', 'population', 'households', 'median_income']
SimpleImputer(strategy='median')
FunctionTransformer(func=<ufunc 'log'>)
StandardScaler()
['latitude', 'longitude']
ClusterSimilarity(random_state=42)
<sklearn.compose._column_transformer.make_column_selector object at 0x186e9fd30>
SimpleImputer(strategy='most_frequent')
OneHotEncoder(handle_unknown='ignore')
['housing_median_age']
SimpleImputer(strategy='median')
StandardScaler()
RandomForestRegressor(random_state=42)
\n", " | n_clusters | \n", "max_features | \n", "split0 | \n", "split1 | \n", "split2 | \n", "mean_test_rmse | \n", "
---|---|---|---|---|---|---|
0 | \n", "9 | \n", "5 | \n", "45311 | \n", "44727 | \n", "46099 | \n", "45379 | \n", "
4 | \n", "9 | \n", "3 | \n", "45721 | \n", "45397 | \n", "46743 | \n", "45953 | \n", "
7 | \n", "7 | \n", "5 | \n", "45887 | \n", "45597 | \n", "46504 | \n", "45996 | \n", "
3 | \n", "7 | \n", "6 | \n", "45633 | \n", "45815 | \n", "46702 | \n", "46050 | \n", "
1 | \n", "7 | \n", "8 | \n", "45887 | \n", "45820 | \n", "46809 | \n", "46172 | \n", "
GridSearchCV(cv=3,\n", " estimator=Pipeline(steps=[('preprocessing',\n", " ColumnTransformer(remainder=Pipeline(steps=[('simpleimputer',\n", " SimpleImputer(strategy='median')),\n", " ('standardscaler',\n", " StandardScaler())]),\n", " transformers=[('bedrooms_ratio',\n", " Pipeline(steps=[('simpleimputer',\n", " SimpleImputer(strategy='median')),\n", " ('functiontransformer',\n", " FunctionTransformer(func=<function...\n", " <sklearn.compose._column_transformer.make_column_selector object at 0x17f06eaf0>)])),\n", " ('svr', SVR())]),\n", " param_grid=[{'svr__C': [10.0, 30.0, 100.0, 300.0, 1000.0, 3000.0,\n", " 10000.0, 30000.0],\n", " 'svr__kernel': ['linear']},\n", " {'svr__C': [1.0, 3.0, 10.0, 30.0, 100.0, 300.0,\n", " 1000.0],\n", " 'svr__gamma': [0.01, 0.03, 0.1, 0.3, 1.0, 3.0],\n", " 'svr__kernel': ['rbf']}],\n", " scoring='neg_root_mean_squared_error')In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
GridSearchCV(cv=3,\n", " estimator=Pipeline(steps=[('preprocessing',\n", " ColumnTransformer(remainder=Pipeline(steps=[('simpleimputer',\n", " SimpleImputer(strategy='median')),\n", " ('standardscaler',\n", " StandardScaler())]),\n", " transformers=[('bedrooms_ratio',\n", " Pipeline(steps=[('simpleimputer',\n", " SimpleImputer(strategy='median')),\n", " ('functiontransformer',\n", " FunctionTransformer(func=<function...\n", " <sklearn.compose._column_transformer.make_column_selector object at 0x17f06eaf0>)])),\n", " ('svr', SVR())]),\n", " param_grid=[{'svr__C': [10.0, 30.0, 100.0, 300.0, 1000.0, 3000.0,\n", " 10000.0, 30000.0],\n", " 'svr__kernel': ['linear']},\n", " {'svr__C': [1.0, 3.0, 10.0, 30.0, 100.0, 300.0,\n", " 1000.0],\n", " 'svr__gamma': [0.01, 0.03, 0.1, 0.3, 1.0, 3.0],\n", " 'svr__kernel': ['rbf']}],\n", " scoring='neg_root_mean_squared_error')
Pipeline(steps=[('preprocessing',\n", " ColumnTransformer(remainder=Pipeline(steps=[('simpleimputer',\n", " SimpleImputer(strategy='median')),\n", " ('standardscaler',\n", " StandardScaler())]),\n", " transformers=[('bedrooms_ratio',\n", " Pipeline(steps=[('simpleimputer',\n", " SimpleImputer(strategy='median')),\n", " ('functiontransformer',\n", " FunctionTransformer(func=<function column_ratio at 0x17ef9de50>))...\n", " 'total_rooms', 'population',\n", " 'households',\n", " 'median_income']),\n", " ('geo',\n", " ClusterSimilarity(random_state=42),\n", " ['latitude', 'longitude']),\n", " ('cat',\n", " Pipeline(steps=[('simpleimputer',\n", " SimpleImputer(strategy='most_frequent')),\n", " ('onehotencoder',\n", " OneHotEncoder(handle_unknown='ignore'))]),\n", " <sklearn.compose._column_transformer.make_column_selector object at 0x17f06eaf0>)])),\n", " ('svr', SVR())])
ColumnTransformer(remainder=Pipeline(steps=[('simpleimputer',\n", " SimpleImputer(strategy='median')),\n", " ('standardscaler',\n", " StandardScaler())]),\n", " transformers=[('bedrooms_ratio',\n", " Pipeline(steps=[('simpleimputer',\n", " SimpleImputer(strategy='median')),\n", " ('functiontransformer',\n", " FunctionTransformer(func=<function column_ratio at 0x17ef9de50>)),\n", " ('standardscaler',\n", " StandardScaler...\n", " ['total_bedrooms', 'total_rooms', 'population',\n", " 'households', 'median_income']),\n", " ('geo', ClusterSimilarity(random_state=42),\n", " ['latitude', 'longitude']),\n", " ('cat',\n", " Pipeline(steps=[('simpleimputer',\n", " SimpleImputer(strategy='most_frequent')),\n", " ('onehotencoder',\n", " OneHotEncoder(handle_unknown='ignore'))]),\n", " <sklearn.compose._column_transformer.make_column_selector object at 0x17f06eaf0>)])
['total_bedrooms', 'total_rooms']
SimpleImputer(strategy='median')
FunctionTransformer(func=<function column_ratio at 0x17ef9de50>)
StandardScaler()
['total_rooms', 'households']
SimpleImputer(strategy='median')
FunctionTransformer(func=<function column_ratio at 0x17ef9de50>)
StandardScaler()
['population', 'households']
SimpleImputer(strategy='median')
FunctionTransformer(func=<function column_ratio at 0x17ef9de50>)
StandardScaler()
['total_bedrooms', 'total_rooms', 'population', 'households', 'median_income']
SimpleImputer(strategy='median')
FunctionTransformer(func=<ufunc 'log'>)
StandardScaler()
['latitude', 'longitude']
ClusterSimilarity(random_state=42)
<sklearn.compose._column_transformer.make_column_selector object at 0x17f06eaf0>
SimpleImputer(strategy='most_frequent')
OneHotEncoder(handle_unknown='ignore')
['housing_median_age']
SimpleImputer(strategy='median')
StandardScaler()
SVR()
RandomizedSearchCV(cv=3,\n", " estimator=Pipeline(steps=[('preprocessing',\n", " ColumnTransformer(remainder=Pipeline(steps=[('simpleimputer',\n", " SimpleImputer(strategy='median')),\n", " ('standardscaler',\n", " StandardScaler())]),\n", " transformers=[('bedrooms_ratio',\n", " Pipeline(steps=[('simpleimputer',\n", " SimpleImputer(strategy='median')),\n", " ('functiontransformer',\n", " FunctionTransformer(func=<fu...\n", " <sklearn.compose._column_transformer.make_column_selector object at 0x17f06eaf0>)])),\n", " ('svr', SVR())]),\n", " param_distributions={'svr__C': <scipy.stats._distn_infrastructure.rv_continuous_frozen object at 0x17ebcf070>,\n", " 'svr__gamma': <scipy.stats._distn_infrastructure.rv_continuous_frozen object at 0x17ebf1be0>,\n", " 'svr__kernel': ['linear', 'rbf']},\n", " random_state=42, scoring='neg_root_mean_squared_error')In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
RandomizedSearchCV(cv=3,\n", " estimator=Pipeline(steps=[('preprocessing',\n", " ColumnTransformer(remainder=Pipeline(steps=[('simpleimputer',\n", " SimpleImputer(strategy='median')),\n", " ('standardscaler',\n", " StandardScaler())]),\n", " transformers=[('bedrooms_ratio',\n", " Pipeline(steps=[('simpleimputer',\n", " SimpleImputer(strategy='median')),\n", " ('functiontransformer',\n", " FunctionTransformer(func=<fu...\n", " <sklearn.compose._column_transformer.make_column_selector object at 0x17f06eaf0>)])),\n", " ('svr', SVR())]),\n", " param_distributions={'svr__C': <scipy.stats._distn_infrastructure.rv_continuous_frozen object at 0x17ebcf070>,\n", " 'svr__gamma': <scipy.stats._distn_infrastructure.rv_continuous_frozen object at 0x17ebf1be0>,\n", " 'svr__kernel': ['linear', 'rbf']},\n", " random_state=42, scoring='neg_root_mean_squared_error')
Pipeline(steps=[('preprocessing',\n", " ColumnTransformer(remainder=Pipeline(steps=[('simpleimputer',\n", " SimpleImputer(strategy='median')),\n", " ('standardscaler',\n", " StandardScaler())]),\n", " transformers=[('bedrooms_ratio',\n", " Pipeline(steps=[('simpleimputer',\n", " SimpleImputer(strategy='median')),\n", " ('functiontransformer',\n", " FunctionTransformer(func=<function column_ratio at 0x17ef9de50>))...\n", " 'total_rooms', 'population',\n", " 'households',\n", " 'median_income']),\n", " ('geo',\n", " ClusterSimilarity(random_state=42),\n", " ['latitude', 'longitude']),\n", " ('cat',\n", " Pipeline(steps=[('simpleimputer',\n", " SimpleImputer(strategy='most_frequent')),\n", " ('onehotencoder',\n", " OneHotEncoder(handle_unknown='ignore'))]),\n", " <sklearn.compose._column_transformer.make_column_selector object at 0x17f06eaf0>)])),\n", " ('svr', SVR())])
ColumnTransformer(remainder=Pipeline(steps=[('simpleimputer',\n", " SimpleImputer(strategy='median')),\n", " ('standardscaler',\n", " StandardScaler())]),\n", " transformers=[('bedrooms_ratio',\n", " Pipeline(steps=[('simpleimputer',\n", " SimpleImputer(strategy='median')),\n", " ('functiontransformer',\n", " FunctionTransformer(func=<function column_ratio at 0x17ef9de50>)),\n", " ('standardscaler',\n", " StandardScaler...\n", " ['total_bedrooms', 'total_rooms', 'population',\n", " 'households', 'median_income']),\n", " ('geo', ClusterSimilarity(random_state=42),\n", " ['latitude', 'longitude']),\n", " ('cat',\n", " Pipeline(steps=[('simpleimputer',\n", " SimpleImputer(strategy='most_frequent')),\n", " ('onehotencoder',\n", " OneHotEncoder(handle_unknown='ignore'))]),\n", " <sklearn.compose._column_transformer.make_column_selector object at 0x17f06eaf0>)])
['total_bedrooms', 'total_rooms']
SimpleImputer(strategy='median')
FunctionTransformer(func=<function column_ratio at 0x17ef9de50>)
StandardScaler()
['total_rooms', 'households']
SimpleImputer(strategy='median')
FunctionTransformer(func=<function column_ratio at 0x17ef9de50>)
StandardScaler()
['population', 'households']
SimpleImputer(strategy='median')
FunctionTransformer(func=<function column_ratio at 0x17ef9de50>)
StandardScaler()
['total_bedrooms', 'total_rooms', 'population', 'households', 'median_income']
SimpleImputer(strategy='median')
FunctionTransformer(func=<ufunc 'log'>)
StandardScaler()
['latitude', 'longitude']
ClusterSimilarity(random_state=42)
<sklearn.compose._column_transformer.make_column_selector object at 0x17f06eaf0>
SimpleImputer(strategy='most_frequent')
OneHotEncoder(handle_unknown='ignore')
['housing_median_age']
SimpleImputer(strategy='median')
StandardScaler()
SVR()