{ "cells": [ { "cell_type": "markdown", "metadata": { "slideshow": { "slide_type": "slide" } }, "source": [ "# Fitting a line\n" ] }, { "cell_type": "markdown", "metadata": { "slideshow": { "slide_type": "fragment" } }, "source": [ "## Setup" ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "slideshow": { "slide_type": "fragment" } }, "outputs": [], "source": [ "import pandas as pd\n", "\n", "from sklearn.linear_model import LinearRegression\n", "from sklearn.metrics import mean_squared_error" ] }, { "cell_type": "markdown", "metadata": { "slideshow": { "slide_type": "slide" } }, "source": [ "## Data" ] }, { "cell_type": "markdown", "metadata": { "slideshow": { "slide_type": "slide" } }, "source": [ "### Import data" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "slideshow": { "slide_type": "fragment" } }, "outputs": [], "source": [ "df = pd.read_csv('https://raw.githubusercontent.com/kirenz/datasets/master/possum.csv')" ] }, { "cell_type": "markdown", "metadata": { "slideshow": { "slide_type": "slide" } }, "source": [ "### Data structure" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "slideshow": { "slide_type": "subslide" } }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
sitepopsexagehead_lskull_wtotal_ltail_l
01Vicm8.094.160.489.036.0
11Vicf6.092.557.691.536.5
21Vicf6.094.060.095.539.0
31Vicf6.093.257.192.038.0
41Vicf2.091.556.385.536.0
...........................
997otherm1.089.556.081.536.5
1007otherm1.088.654.782.539.0
1017otherf6.092.455.089.038.0
1027otherm4.091.555.282.536.5
1037otherf3.093.659.989.040.0
\n", "

104 rows × 8 columns

\n", "
" ], "text/plain": [ " site pop sex age head_l skull_w total_l tail_l\n", "0 1 Vic m 8.0 94.1 60.4 89.0 36.0\n", "1 1 Vic f 6.0 92.5 57.6 91.5 36.5\n", "2 1 Vic f 6.0 94.0 60.0 95.5 39.0\n", "3 1 Vic f 6.0 93.2 57.1 92.0 38.0\n", "4 1 Vic f 2.0 91.5 56.3 85.5 36.0\n", ".. ... ... .. ... ... ... ... ...\n", "99 7 other m 1.0 89.5 56.0 81.5 36.5\n", "100 7 other m 1.0 88.6 54.7 82.5 39.0\n", "101 7 other f 6.0 92.4 55.0 89.0 38.0\n", "102 7 other m 4.0 91.5 55.2 82.5 36.5\n", "103 7 other f 3.0 93.6 59.9 89.0 40.0\n", "\n", "[104 rows x 8 columns]" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "scrolled": true, "slideshow": { "slide_type": "subslide" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "RangeIndex: 104 entries, 0 to 103\n", "Data columns (total 8 columns):\n", " # Column Non-Null Count Dtype \n", "--- ------ -------------- ----- \n", " 0 site 104 non-null int64 \n", " 1 pop 104 non-null object \n", " 2 sex 104 non-null object \n", " 3 age 102 non-null float64\n", " 4 head_l 104 non-null float64\n", " 5 skull_w 104 non-null float64\n", " 6 total_l 104 non-null float64\n", " 7 tail_l 104 non-null float64\n", "dtypes: float64(5), int64(1), object(2)\n", "memory usage: 6.6+ KB\n" ] } ], "source": [ "df.info()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Variable lists" ] }, { "cell_type": "markdown", "metadata": { "slideshow": { "slide_type": "slide" } }, "source": [ "Prepare data for the scikit-learn model:" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "slideshow": { "slide_type": "fragment" } }, "outputs": [], "source": [ "y_label = \"head_l\"\n", "\n", "X = df[[\"total_l\"]]\n", "y = df[y_label]" ] }, { "cell_type": "markdown", "metadata": { "slideshow": { "slide_type": "slide" } }, "source": [ "## Model" ] }, { "cell_type": "markdown", "metadata": { "slideshow": { 
"slide_type": "fragment" } }, "source": [ "### Select model" ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "slideshow": { "slide_type": "fragment" } }, "outputs": [], "source": [ "# Choose the linear regression model\n", "reg = LinearRegression()" ] }, { "cell_type": "markdown", "metadata": { "slideshow": { "slide_type": "fragment" } }, "source": [ "### Fit model" ] }, { "cell_type": "code", "execution_count": 7, "metadata": { "slideshow": { "slide_type": "fragment" } }, "outputs": [ { "data": { "text/html": [ "
LinearRegression()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
" ], "text/plain": [ "LinearRegression()" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Fit the model to the data\n", "reg.fit(X, y)" ] }, { "cell_type": "markdown", "metadata": { "slideshow": { "slide_type": "slide" } }, "source": [ "### Coefficients" ] }, { "cell_type": "code", "execution_count": 8, "metadata": { "slideshow": { "slide_type": "fragment" } }, "outputs": [ { "data": { "text/plain": [ "42.70979314896378" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Intercept\n", "reg.intercept_" ] }, { "cell_type": "code", "execution_count": 9, "metadata": { "slideshow": { "slide_type": "fragment" } }, "outputs": [ { "data": { "text/plain": [ "array([0.57290128])" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Slope\n", "reg.coef_" ] }, { "cell_type": "markdown", "metadata": { "slideshow": { "slide_type": "slide" } }, "source": [ "### Make predictions" ] }, { "cell_type": "code", "execution_count": 10, "metadata": { "slideshow": { "slide_type": "fragment" } }, "outputs": [], "source": [ "# Make predictions on the data\n", "y_pred = reg.predict(X)" ] }, { "cell_type": "markdown", "metadata": { "slideshow": { "slide_type": "slide" } }, "source": [ "### Evaluation" ] }, { "cell_type": "markdown", "metadata": { "slideshow": { "slide_type": "slide" } }, "source": [ "#### Mean squared error" ] }, { "cell_type": "code", "execution_count": 11, "metadata": { "slideshow": { "slide_type": "fragment" } }, "outputs": [ { "data": { "text/plain": [ "6.6061634260446445" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "mean_squared_error(y, y_pred)" ] }, { "cell_type": "markdown", "metadata": { "slideshow": { "slide_type": "slide" } }, "source": [ "#### Root mean squared error" ] }, { "cell_type": "code", "execution_count": 12, "metadata": { "slideshow": { "slide_type": "fragment" } }, 
"outputs": [ { "data": { "text/plain": [ "2.570245790978879" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# RMSE is the square root of the MSE. It is computed explicitly because the\n", "# `squared` keyword of mean_squared_error was deprecated in scikit-learn 1.4\n", "# and removed in 1.6, so `squared=False` fails on current releases.\n", "mean_squared_error(y, y_pred) ** 0.5" ] } ], "metadata": { "celltoolbar": "Slideshow", "kernelspec": { "display_name": "Python 3.9.13 ('ds')", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.9.13" }, "vscode": { "interpreter": { "hash": "0de8387c967863cc622aba8b7ea5b466d4dfde089153d484429677aa77034389" } } }, "nbformat": 4, "nbformat_minor": 2 }