{ "cells": [ { "cell_type": "markdown", "id": "8daa7507", "metadata": {}, "source": [ "# pandas: Basic Pandas for Data Science\n" ] }, { "cell_type": "markdown", "id": "024daa28", "metadata": {}, "source": [ "## Package Import" ] }, { "cell_type": "code", "execution_count": 1, "id": "60d53240", "metadata": { "execution": { "iopub.execute_input": "2025-09-02T16:31:19.813407Z", "iopub.status.busy": "2025-09-02T16:31:19.813258Z", "iopub.status.idle": "2025-09-02T16:31:20.026145Z", "shell.execute_reply": "2025-09-02T16:31:20.025482Z" }, "tags": [ "scroll-output", "hide-output" ] }, "outputs": [], "source": [ "import pandas as pd\n", "import numpy as np" ] }, { "cell_type": "markdown", "id": "5cddb9e8", "metadata": {}, "source": [ "## Dataset Import" ] }, { "cell_type": "markdown", "id": "78991f33", "metadata": {}, "source": [ "The dataset used in this notebook is from [Kaggle - Pokemon](https://www.kaggle.com/datasets/abcsds/pokemon)." ] }, { "cell_type": "code", "execution_count": 2, "id": "06bcb2dd", "metadata": { "execution": { "iopub.execute_input": "2025-09-02T16:31:20.028131Z", "iopub.status.busy": "2025-09-02T16:31:20.027966Z", "iopub.status.idle": "2025-09-02T16:31:20.032774Z", "shell.execute_reply": "2025-09-02T16:31:20.032255Z" }, "tags": [ "scroll-output", "hide-output" ] }, "outputs": [], "source": [ "data = pd.read_csv('data/Pokemon.csv')" ] }, { "cell_type": "markdown", "id": "33d5fbdc", "metadata": {}, "source": [ "## Manually Create a DataFrame" ] }, { "cell_type": "markdown", "id": "6188f5fc", "metadata": {}, "source": [ "From a Dictionary\\\n", "The column order follows the order in which the keys were inserted:" ] }, { "cell_type": "code", "execution_count": 3, "id": "6e98d709", "metadata": { "execution": { "iopub.execute_input": "2025-09-02T16:31:20.034493Z", "iopub.status.busy": "2025-09-02T16:31:20.034201Z", "iopub.status.idle": "2025-09-02T16:31:20.042386Z", "shell.execute_reply": "2025-09-02T16:31:20.041908Z" }, "tags": [ "scroll-output", "hide-output" ] }, 
"outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Column 1Column 2
0100300
1200400
\n", "
" ], "text/plain": [ " Column 1 Column 2\n", "0 100 300\n", "1 200 400" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df = pd.DataFrame({'Column 1': [100,200], 'Column 2': [300,400]})\n", "df" ] }, { "cell_type": "markdown", "id": "addb13ff", "metadata": {}, "source": [ "From a NumPy array of random values, with column names:" ] }, { "cell_type": "code", "execution_count": 4, "id": "c9acef95", "metadata": { "execution": { "iopub.execute_input": "2025-09-02T16:31:20.043588Z", "iopub.status.busy": "2025-09-02T16:31:20.043461Z", "iopub.status.idle": "2025-09-02T16:31:20.049984Z", "shell.execute_reply": "2025-09-02T16:31:20.049390Z" }, "tags": [ "scroll-output", "hide-output" ] }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
abcdefgh
00.8608980.4243860.1601100.2026030.8374600.3563850.0445390.339014
10.8376710.6018760.4434240.0885420.6838280.7258110.5673500.979437
20.5501810.0500530.7448240.2084620.8088650.4377470.9763860.227302
30.1453770.0233500.7079480.5945330.2901700.6909000.8562020.618862
\n", "
" ], "text/plain": [ " a b c d e f g \\\n", "0 0.860898 0.424386 0.160110 0.202603 0.837460 0.356385 0.044539 \n", "1 0.837671 0.601876 0.443424 0.088542 0.683828 0.725811 0.567350 \n", "2 0.550181 0.050053 0.744824 0.208462 0.808865 0.437747 0.976386 \n", "3 0.145377 0.023350 0.707948 0.594533 0.290170 0.690900 0.856202 \n", "\n", " h \n", "0 0.339014 \n", "1 0.979437 \n", "2 0.227302 \n", "3 0.618862 " ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pd.DataFrame(np.random.rand(4, 8), columns=list('abcdefgh'))" ] }, { "cell_type": "markdown", "id": "3f3d810a", "metadata": {}, "source": [ "From a dictionary including Series:" ] }, { "cell_type": "code", "execution_count": 5, "id": "a1212de8", "metadata": { "execution": { "iopub.execute_input": "2025-09-02T16:31:20.051639Z", "iopub.status.busy": "2025-09-02T16:31:20.051459Z", "iopub.status.idle": "2025-09-02T16:31:20.056972Z", "shell.execute_reply": "2025-09-02T16:31:20.056552Z" }, "tags": [ "scroll-output", "hide-output" ] }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
col1col2
00NaN
11NaN
222.0
333.0
\n", "
" ], "text/plain": [ " col1 col2\n", "0 0 NaN\n", "1 1 NaN\n", "2 2 2.0\n", "3 3 3.0" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pd.DataFrame({'col1': [0,1,2,3], 'col2': pd.Series([2,3], index=[2,3])}, index=[0,1,2,3])" ] }, { "cell_type": "markdown", "id": "67d27c8c", "metadata": {}, "source": [ "From numpy ndarray:" ] }, { "cell_type": "code", "execution_count": 6, "id": "84278f58", "metadata": { "execution": { "iopub.execute_input": "2025-09-02T16:31:20.058211Z", "iopub.status.busy": "2025-09-02T16:31:20.058092Z", "iopub.status.idle": "2025-09-02T16:31:20.063509Z", "shell.execute_reply": "2025-09-02T16:31:20.063013Z" }, "tags": [ "scroll-output", "hide-output" ] }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
abc
0123
1456
2789
\n", "
" ], "text/plain": [ " a b c\n", "0 1 2 3\n", "1 4 5 6\n", "2 7 8 9" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df = pd.DataFrame(np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]]),\n", " columns=['a', 'b', 'c'])\n", "df" ] }, { "cell_type": "markdown", "id": "23f92b7d", "metadata": {}, "source": [ "From a numpy ndarray that has labeled columns:" ] }, { "cell_type": "code", "execution_count": 7, "id": "da18ad79", "metadata": { "execution": { "iopub.execute_input": "2025-09-02T16:31:20.064814Z", "iopub.status.busy": "2025-09-02T16:31:20.064693Z", "iopub.status.idle": "2025-09-02T16:31:20.070098Z", "shell.execute_reply": "2025-09-02T16:31:20.069611Z" }, "tags": [ "scroll-output", "hide-output" ] }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
ca
031
164
297
\n", "
" ], "text/plain": [ " c a\n", "0 3 1\n", "1 6 4\n", "2 9 7" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "d = np.array([(1,2,3), (4,5,6), (7,8,9)], dtype=[(\"a\", \"i4\"), (\"b\", \"i4\"), (\"c\", \"i4\")])\n", "df = pd.DataFrame(data=d, columns=['c', 'a'])\n", "df" ] }, { "cell_type": "markdown", "id": "cbdf59d4", "metadata": {}, "source": [ "From Series/DataFrame:" ] }, { "cell_type": "code", "execution_count": 8, "id": "ea0c083a", "metadata": { "execution": { "iopub.execute_input": "2025-09-02T16:31:20.071524Z", "iopub.status.busy": "2025-09-02T16:31:20.071403Z", "iopub.status.idle": "2025-09-02T16:31:20.076385Z", "shell.execute_reply": "2025-09-02T16:31:20.075899Z" }, "tags": [ "scroll-output", "hide-output" ] }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
hehe
c3
a1
\n", "
" ], "text/plain": [ " hehe\n", "c 3\n", "a 1" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "ser = pd.Series([1,2,3], index=['a','b','c'])\n", "df = pd.DataFrame(data=ser, index=['c', 'a'], columns=['hehe'])\n", "df" ] }, { "cell_type": "markdown", "id": "50577d37", "metadata": {}, "source": [ "When constructing from an existing DataFrame, `index` and `columns` select labels from the original. Any requested column that does not exist in the original is filled with NaN." ] }, { "cell_type": "code", "execution_count": 9, "id": "6d4fa3ec", "metadata": { "execution": { "iopub.execute_input": "2025-09-02T16:31:20.077697Z", "iopub.status.busy": "2025-09-02T16:31:20.077567Z", "iopub.status.idle": "2025-09-02T16:31:20.083380Z", "shell.execute_reply": "2025-09-02T16:31:20.082882Z" }, "tags": [ "scroll-output", "hide-output" ] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " x\n", "c 3\n", "a 1 \n", " z\n", "c NaN\n", "a NaN\n" ] } ], "source": [ "df1 = pd.DataFrame([1,2,3], index=['a','b','c'], columns=['x'])\n", "df2 = pd.DataFrame(data=df1, index=['c', 'a'])\n", "df3 = pd.DataFrame(data=df1, index=['c', 'a'], columns=['z'])\n", "print(df2, '\\n',df3)" ] }, { "cell_type": "markdown", "id": "92026a09", "metadata": {}, "source": [ "## Reverse Row/Column Order" ] }, { "cell_type": "markdown", "id": "7925717d", "metadata": {}, "source": [ "See also `iloc` [here](https://cornel05.github.io/cornel.ai/notebooks/snippets/pandas/basic_pandas/basic_pandas_1.html#iloc-and-loc-in-pandas)" ] }, { "cell_type": "code", "execution_count": 10, "id": "94368e91", "metadata": { "execution": { "iopub.execute_input": "2025-09-02T16:31:20.084831Z", "iopub.status.busy": "2025-09-02T16:31:20.084689Z", "iopub.status.idle": "2025-09-02T16:31:20.092404Z", "shell.execute_reply": "2025-09-02T16:31:20.091965Z" }, "tags": [ "scroll-output", "hide-output" ] }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
#NameType 1Type 2TotalHPAttackDefenseSp. AtkSp. DefSpeedGenerationLegendary
01BulbasaurGrassPoison3184549496565451False
12IvysaurGrassPoison4056062638080601False
23VenusaurGrassPoison525808283100100801False
33VenusaurMega VenusaurGrassPoison62580100123122120801False
44CharmanderFireNaN3093952436050651False
\n", "
" ], "text/plain": [ " # Name Type 1 Type 2 Total HP Attack Defense \\\n", "0 1 Bulbasaur Grass Poison 318 45 49 49 \n", "1 2 Ivysaur Grass Poison 405 60 62 63 \n", "2 3 Venusaur Grass Poison 525 80 82 83 \n", "3 3 VenusaurMega Venusaur Grass Poison 625 80 100 123 \n", "4 4 Charmander Fire NaN 309 39 52 43 \n", "\n", " Sp. Atk Sp. Def Speed Generation Legendary \n", "0 65 65 45 1 False \n", "1 80 80 60 1 False \n", "2 100 100 80 1 False \n", "3 122 120 80 1 False \n", "4 60 50 65 1 False " ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data.head()" ] }, { "cell_type": "markdown", "id": "67083771", "metadata": {}, "source": [ "### Row" ] }, { "cell_type": "code", "execution_count": 11, "id": "bfc7d252", "metadata": { "execution": { "iopub.execute_input": "2025-09-02T16:31:20.093941Z", "iopub.status.busy": "2025-09-02T16:31:20.093799Z", "iopub.status.idle": "2025-09-02T16:31:20.100695Z", "shell.execute_reply": "2025-09-02T16:31:20.099981Z" }, "tags": [ "scroll-output", "hide-output" ] }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
#NameType 1Type 2TotalHPAttackDefenseSp. AtkSp. DefSpeedGenerationLegendary
799721VolcanionFireWater6008011012013090706True
798720HoopaHoopa UnboundPsychicDark6808016060170130806True
797720HoopaHoopa ConfinedPsychicGhost6008011060150130706True
796719DiancieMega DiancieRockFairy700501601101601101106True
795719DiancieRockFairy60050100150100150506True
\n", "
" ], "text/plain": [ " # Name Type 1 Type 2 Total HP Attack Defense \\\n", "799 721 Volcanion Fire Water 600 80 110 120 \n", "798 720 HoopaHoopa Unbound Psychic Dark 680 80 160 60 \n", "797 720 HoopaHoopa Confined Psychic Ghost 600 80 110 60 \n", "796 719 DiancieMega Diancie Rock Fairy 700 50 160 110 \n", "795 719 Diancie Rock Fairy 600 50 100 150 \n", "\n", " Sp. Atk Sp. Def Speed Generation Legendary \n", "799 130 90 70 6 True \n", "798 170 130 80 6 True \n", "797 150 130 70 6 True \n", "796 160 110 110 6 True \n", "795 100 150 50 6 True " ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data.iloc[::-1].head()" ] }, { "cell_type": "markdown", "id": "c0768b76", "metadata": {}, "source": [ "See also `reset_index` [here](https://cornel05.github.io/cornel.ai/notebooks/snippets/pandas/basic_pandas/basic_pandas_1.html#reset-index)." ] }, { "cell_type": "code", "execution_count": 12, "id": "c16f8174", "metadata": { "execution": { "iopub.execute_input": "2025-09-02T16:31:20.102163Z", "iopub.status.busy": "2025-09-02T16:31:20.102006Z", "iopub.status.idle": "2025-09-02T16:31:20.109277Z", "shell.execute_reply": "2025-09-02T16:31:20.108698Z" }, "tags": [ "scroll-output", "hide-output" ] }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
index#NameType 1Type 2TotalHPAttackDefenseSp. AtkSp. DefSpeedGenerationLegendary
0799721VolcanionFireWater6008011012013090706True
1798720HoopaHoopa UnboundPsychicDark6808016060170130806True
2797720HoopaHoopa ConfinedPsychicGhost6008011060150130706True
3796719DiancieMega DiancieRockFairy700501601101601101106True
4795719DiancieRockFairy60050100150100150506True
\n", "
" ], "text/plain": [ " index # Name Type 1 Type 2 Total HP Attack \\\n", "0 799 721 Volcanion Fire Water 600 80 110 \n", "1 798 720 HoopaHoopa Unbound Psychic Dark 680 80 160 \n", "2 797 720 HoopaHoopa Confined Psychic Ghost 600 80 110 \n", "3 796 719 DiancieMega Diancie Rock Fairy 700 50 160 \n", "4 795 719 Diancie Rock Fairy 600 50 100 \n", "\n", " Defense Sp. Atk Sp. Def Speed Generation Legendary \n", "0 120 130 90 70 6 True \n", "1 60 170 130 80 6 True \n", "2 60 150 130 70 6 True \n", "3 110 160 110 110 6 True \n", "4 150 100 150 50 6 True " ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data.iloc[::-1].reset_index().head()" ] }, { "cell_type": "code", "execution_count": 13, "id": "c685e70f", "metadata": { "execution": { "iopub.execute_input": "2025-09-02T16:31:20.110783Z", "iopub.status.busy": "2025-09-02T16:31:20.110641Z", "iopub.status.idle": "2025-09-02T16:31:20.117294Z", "shell.execute_reply": "2025-09-02T16:31:20.116590Z" }, "tags": [ "scroll-output", "hide-output" ] }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
#NameType 1Type 2TotalHPAttackDefenseSp. AtkSp. DefSpeedGenerationLegendary
0721VolcanionFireWater6008011012013090706True
1720HoopaHoopa UnboundPsychicDark6808016060170130806True
2720HoopaHoopa ConfinedPsychicGhost6008011060150130706True
3719DiancieMega DiancieRockFairy700501601101601101106True
4719DiancieRockFairy60050100150100150506True
\n", "
" ], "text/plain": [ " # Name Type 1 Type 2 Total HP Attack Defense \\\n", "0 721 Volcanion Fire Water 600 80 110 120 \n", "1 720 HoopaHoopa Unbound Psychic Dark 680 80 160 60 \n", "2 720 HoopaHoopa Confined Psychic Ghost 600 80 110 60 \n", "3 719 DiancieMega Diancie Rock Fairy 700 50 160 110 \n", "4 719 Diancie Rock Fairy 600 50 100 150 \n", "\n", " Sp. Atk Sp. Def Speed Generation Legendary \n", "0 130 90 70 6 True \n", "1 170 130 80 6 True \n", "2 150 130 70 6 True \n", "3 160 110 110 6 True \n", "4 100 150 50 6 True " ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data.iloc[::-1].reset_index(drop=True).head()" ] }, { "cell_type": "markdown", "id": "8a54779c", "metadata": {}, "source": [ "### Column" ] }, { "cell_type": "code", "execution_count": 14, "id": "46a3629c", "metadata": { "execution": { "iopub.execute_input": "2025-09-02T16:31:20.119001Z", "iopub.status.busy": "2025-09-02T16:31:20.118748Z", "iopub.status.idle": "2025-09-02T16:31:20.125256Z", "shell.execute_reply": "2025-09-02T16:31:20.124780Z" }, "tags": [ "scroll-output", "hide-output" ] }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
LegendaryGenerationSpeedSp. DefSp. AtkDefenseAttackHPTotalType 2Type 1Name#
0False1456565494945318PoisonGrassBulbasaur1
1False1608080636260405PoisonGrassIvysaur2
2False180100100838280525PoisonGrassVenusaur3
3False18012012212310080625PoisonGrassVenusaurMega Venusaur3
4False1655060435239309NaNFireCharmander4
\n", "
" ], "text/plain": [ " Legendary Generation Speed Sp. Def Sp. Atk Defense Attack HP Total \\\n", "0 False 1 45 65 65 49 49 45 318 \n", "1 False 1 60 80 80 63 62 60 405 \n", "2 False 1 80 100 100 83 82 80 525 \n", "3 False 1 80 120 122 123 100 80 625 \n", "4 False 1 65 50 60 43 52 39 309 \n", "\n", " Type 2 Type 1 Name # \n", "0 Poison Grass Bulbasaur 1 \n", "1 Poison Grass Ivysaur 2 \n", "2 Poison Grass Venusaur 3 \n", "3 Poison Grass VenusaurMega Venusaur 3 \n", "4 NaN Fire Charmander 4 " ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data.iloc[:, ::-1].head()" ] }, { "cell_type": "markdown", "id": "8f01f1d0", "metadata": {}, "source": [ "## Select column by data type" ] }, { "cell_type": "markdown", "id": "84e77d1e", "metadata": {}, "source": [ "Check columns data types:" ] }, { "cell_type": "code", "execution_count": 15, "id": "99732276", "metadata": { "execution": { "iopub.execute_input": "2025-09-02T16:31:20.126644Z", "iopub.status.busy": "2025-09-02T16:31:20.126511Z", "iopub.status.idle": "2025-09-02T16:31:20.130707Z", "shell.execute_reply": "2025-09-02T16:31:20.130186Z" }, "tags": [ "scroll-output", "hide-output" ] }, "outputs": [ { "data": { "text/plain": [ "# int64\n", "Name object\n", "Type 1 object\n", "Type 2 object\n", "Total int64\n", "HP int64\n", "Attack int64\n", "Defense int64\n", "Sp. Atk int64\n", "Sp. 
Def int64\n", "Speed int64\n", "Generation int64\n", "Legendary bool\n", "dtype: object" ] }, "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data.dtypes" ] }, { "cell_type": "markdown", "id": "c03d10d1", "metadata": {}, "source": [ "Select all columns that are int or float:" ] }, { "cell_type": "code", "execution_count": 16, "id": "c503fa65", "metadata": { "execution": { "iopub.execute_input": "2025-09-02T16:31:20.132111Z", "iopub.status.busy": "2025-09-02T16:31:20.131968Z", "iopub.status.idle": "2025-09-02T16:31:20.138208Z", "shell.execute_reply": "2025-09-02T16:31:20.137692Z" }, "tags": [ "scroll-output", "hide-output" ] }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
#TotalHPAttackDefenseSp. AtkSp. DefSpeedGeneration
013184549496565451
124056062638080601
23525808283100100801
3362580100123122120801
443093952436050651
\n", "
" ], "text/plain": [ " # Total HP Attack Defense Sp. Atk Sp. Def Speed Generation\n", "0 1 318 45 49 49 65 65 45 1\n", "1 2 405 60 62 63 80 80 60 1\n", "2 3 525 80 82 83 100 100 80 1\n", "3 3 625 80 100 123 122 120 80 1\n", "4 4 309 39 52 43 60 50 65 1" ] }, "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data.select_dtypes(include='number').head()" ] }, { "cell_type": "markdown", "id": "82e70419", "metadata": {}, "source": [ "Select multiple types by passing as a list:" ] }, { "cell_type": "code", "execution_count": 17, "id": "b1ec1a18", "metadata": { "execution": { "iopub.execute_input": "2025-09-02T16:31:20.139661Z", "iopub.status.busy": "2025-09-02T16:31:20.139526Z", "iopub.status.idle": "2025-09-02T16:31:20.145592Z", "shell.execute_reply": "2025-09-02T16:31:20.145069Z" }, "tags": [ "scroll-output", "hide-output" ] }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
#NameType 1Type 2TotalHPAttackDefenseSp. AtkSp. DefSpeedGeneration
01BulbasaurGrassPoison3184549496565451
12IvysaurGrassPoison4056062638080601
23VenusaurGrassPoison525808283100100801
33VenusaurMega VenusaurGrassPoison62580100123122120801
44CharmanderFireNaN3093952436050651
\n", "
" ], "text/plain": [ " # Name Type 1 Type 2 Total HP Attack Defense \\\n", "0 1 Bulbasaur Grass Poison 318 45 49 49 \n", "1 2 Ivysaur Grass Poison 405 60 62 63 \n", "2 3 Venusaur Grass Poison 525 80 82 83 \n", "3 3 VenusaurMega Venusaur Grass Poison 625 80 100 123 \n", "4 4 Charmander Fire NaN 309 39 52 43 \n", "\n", " Sp. Atk Sp. Def Speed Generation \n", "0 65 65 45 1 \n", "1 80 80 60 1 \n", "2 100 100 80 1 \n", "3 122 120 80 1 \n", "4 60 50 65 1 " ] }, "execution_count": 17, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data.select_dtypes(include=['number', 'object']).head()" ] }, { "cell_type": "markdown", "id": "9d45c27e", "metadata": {}, "source": [ "Or exclude types by using `exclude` parameter:" ] }, { "cell_type": "code", "execution_count": 18, "id": "493861ef", "metadata": { "execution": { "iopub.execute_input": "2025-09-02T16:31:20.146919Z", "iopub.status.busy": "2025-09-02T16:31:20.146786Z", "iopub.status.idle": "2025-09-02T16:31:20.151537Z", "shell.execute_reply": "2025-09-02T16:31:20.151059Z" }, "tags": [ "scroll-output", "hide-output" ] }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Legendary
0False
1False
2False
3False
4False
\n", "
" ], "text/plain": [ " Legendary\n", "0 False\n", "1 False\n", "2 False\n", "3 False\n", "4 False" ] }, "execution_count": 18, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data.select_dtypes(exclude=['number', 'object']).head()" ] }, { "cell_type": "markdown", "id": "91c78a48", "metadata": {}, "source": [ "## Convert strings to numbers" ] }, { "cell_type": "code", "execution_count": 19, "id": "25bbdcde", "metadata": { "execution": { "iopub.execute_input": "2025-09-02T16:31:20.152946Z", "iopub.status.busy": "2025-09-02T16:31:20.152794Z", "iopub.status.idle": "2025-09-02T16:31:20.157786Z", "shell.execute_reply": "2025-09-02T16:31:20.157282Z" }, "tags": [ "scroll-output", "hide-output" ] }, "outputs": [ { "data": { "text/plain": [ "( col1 col2 col3\n", " 0 1.1 4.4 7.7\n", " 1 2.2 5.5 8.8\n", " 2 3.3 6.6 -,\n", " col1 object\n", " col2 object\n", " col3 object\n", " dtype: object)" ] }, "execution_count": 19, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df = pd.DataFrame({'col1': ['1.1', '2.2', '3.3'], 'col2': ['4.4', '5.5', '6.6'], 'col3': ['7.7', '8.8', '-']})\n", "df, df.dtypes" ] }, { "cell_type": "markdown", "id": "3155542c", "metadata": {}, "source": [ "`df.astype()` can convert multiple columns at once. By default (`errors='raise'`) a value that cannot be converted raises an exception; use `errors='ignore'` to leave columns that fail to convert unchanged." 
] }, { "cell_type": "code", "execution_count": 20, "id": "79b95d85", "metadata": { "execution": { "iopub.execute_input": "2025-09-02T16:31:20.159085Z", "iopub.status.busy": "2025-09-02T16:31:20.158955Z", "iopub.status.idle": "2025-09-02T16:31:20.164587Z", "shell.execute_reply": "2025-09-02T16:31:20.164023Z" }, "tags": [ "scroll-output", "hide-output" ] }, "outputs": [ { "data": { "text/plain": [ "col1 float64\n", "col2 float64\n", "col3 object\n", "dtype: object" ] }, "execution_count": 20, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.astype({'col1': 'float', 'col2': 'float'}, errors='raise').dtypes" ] }, { "cell_type": "code", "execution_count": 21, "id": "5271f542", "metadata": { "execution": { "iopub.execute_input": "2025-09-02T16:31:20.165888Z", "iopub.status.busy": "2025-09-02T16:31:20.165760Z", "iopub.status.idle": "2025-09-02T16:31:20.170566Z", "shell.execute_reply": "2025-09-02T16:31:20.170086Z" }, "tags": [ "scroll-output", "hide-output" ] }, "outputs": [ { "data": { "text/plain": [ "col1 float64\n", "col2 float64\n", "col3 object\n", "dtype: object" ] }, "execution_count": 21, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.astype({'col1': 'float', 'col2': 'float', 'col3': 'float'}, errors='ignore').dtypes" ] }, { "cell_type": "markdown", "id": "aca91c6a", "metadata": {}, "source": [ "A better way to convert strings to numbers is to use `pd.to_numeric()` with `errors='coerce'` to convert invalid parsing to NaN." 
] }, { "cell_type": "code", "execution_count": 22, "id": "2fdb2c63", "metadata": { "execution": { "iopub.execute_input": "2025-09-02T16:31:20.171743Z", "iopub.status.busy": "2025-09-02T16:31:20.171609Z", "iopub.status.idle": "2025-09-02T16:31:20.176577Z", "shell.execute_reply": "2025-09-02T16:31:20.175912Z" }, "tags": [ "scroll-output", "hide-output" ] }, "outputs": [ { "data": { "text/plain": [ "0 7.7\n", "1 8.8\n", "2 NaN\n", "Name: col3, dtype: float64" ] }, "execution_count": 22, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pd.to_numeric(df.col3, errors='coerce')" ] }, { "cell_type": "code", "execution_count": 23, "id": "87272abe", "metadata": { "execution": { "iopub.execute_input": "2025-09-02T16:31:20.177759Z", "iopub.status.busy": "2025-09-02T16:31:20.177639Z", "iopub.status.idle": "2025-09-02T16:31:20.183901Z", "shell.execute_reply": "2025-09-02T16:31:20.183235Z" }, "tags": [ "scroll-output", "hide-output" ] }, "outputs": [ { "data": { "text/plain": [ "0 7.7\n", "1 8.8\n", "2 0.0\n", "Name: col3, dtype: float64" ] }, "execution_count": 23, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pd.to_numeric(df.col3, errors='coerce').fillna(0)" ] }, { "cell_type": "code", "execution_count": 24, "id": "c25a8b95", "metadata": { "execution": { "iopub.execute_input": "2025-09-02T16:31:20.185024Z", "iopub.status.busy": "2025-09-02T16:31:20.184899Z", "iopub.status.idle": "2025-09-02T16:31:20.191834Z", "shell.execute_reply": "2025-09-02T16:31:20.191342Z" }, "tags": [ "scroll-output", "hide-output" ] }, "outputs": [ { "data": { "text/plain": [ "( col1 col2 col3\n", " 0 1.1 4.4 7.7\n", " 1 2.2 5.5 8.8\n", " 2 3.3 6.6 0.0,\n", " col1 float64\n", " col2 float64\n", " col3 float64\n", " dtype: object)" ] }, "execution_count": 24, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df = df.apply(pd.to_numeric, errors='coerce').fillna(0)\n", "df, df.dtypes" ] }, { "cell_type": "markdown", "id": "859415c8", "metadata": {}, 
"source": [ "## Deal with large datasets" ] }, { "cell_type": "markdown", "id": "11e594cf", "metadata": {}, "source": [ "### Check memory usage" ] }, { "cell_type": "code", "execution_count": 25, "id": "cf6c2c86", "metadata": { "execution": { "iopub.execute_input": "2025-09-02T16:31:20.193072Z", "iopub.status.busy": "2025-09-02T16:31:20.192953Z", "iopub.status.idle": "2025-09-02T16:31:20.200292Z", "shell.execute_reply": "2025-09-02T16:31:20.199850Z" }, "tags": [ "scroll-output", "hide-output" ] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "RangeIndex: 800 entries, 0 to 799\n", "Data columns (total 13 columns):\n", " # Column Non-Null Count Dtype \n", "--- ------ -------------- ----- \n", " 0 # 800 non-null int64 \n", " 1 Name 800 non-null object\n", " 2 Type 1 800 non-null object\n", " 3 Type 2 414 non-null object\n", " 4 Total 800 non-null int64 \n", " 5 HP 800 non-null int64 \n", " 6 Attack 800 non-null int64 \n", " 7 Defense 800 non-null int64 \n", " 8 Sp. Atk 800 non-null int64 \n", " 9 Sp. Def 800 non-null int64 \n", " 10 Speed 800 non-null int64 \n", " 11 Generation 800 non-null int64 \n", " 12 Legendary 800 non-null bool \n", "dtypes: bool(1), int64(9), object(3)\n", "memory usage: 179.0 KB\n" ] } ], "source": [ "data.info(memory_usage='deep')" ] }, { "cell_type": "markdown", "id": "b8806e0e", "metadata": {}, "source": [ "### Load specific columns" ] }, { "cell_type": "markdown", "id": "52ff03a8", "metadata": {}, "source": [ "We can load only the columns we need by using the `usecols` parameter of `pd.read_csv()`." ] }, { "cell_type": "code", "execution_count": 26, "id": "5a9eb528", "metadata": { "execution": { "iopub.execute_input": "2025-09-02T16:31:20.201566Z", "iopub.status.busy": "2025-09-02T16:31:20.201432Z", "iopub.status.idle": "2025-09-02T16:31:20.207133Z", "shell.execute_reply": "2025-09-02T16:31:20.206709Z" }, "tags": [ "hide-output", "scroll-output" ] }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
NameType 1
0BulbasaurGrass
1IvysaurGrass
2VenusaurGrass
3VenusaurMega VenusaurGrass
4CharmanderFire
\n", "
" ], "text/plain": [ " Name Type 1\n", "0 Bulbasaur Grass\n", "1 Ivysaur Grass\n", "2 Venusaur Grass\n", "3 VenusaurMega Venusaur Grass\n", "4 Charmander Fire" ] }, "execution_count": 26, "metadata": {}, "output_type": "execute_result" } ], "source": [ "small_data = pd.read_csv('data/Pokemon.csv', usecols=['Name', 'Type 1'])\n", "small_data.head()" ] }, { "cell_type": "markdown", "id": "d3e9c270", "metadata": {}, "source": [ "And it indeed saves memory:" ] }, { "cell_type": "code", "execution_count": 27, "id": "acddccbd", "metadata": { "execution": { "iopub.execute_input": "2025-09-02T16:31:20.208422Z", "iopub.status.busy": "2025-09-02T16:31:20.208303Z", "iopub.status.idle": "2025-09-02T16:31:20.212717Z", "shell.execute_reply": "2025-09-02T16:31:20.212261Z" }, "tags": [ "hide-output", "scroll-output" ] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "RangeIndex: 800 entries, 0 to 799\n", "Data columns (total 2 columns):\n", " # Column Non-Null Count Dtype \n", "--- ------ -------------- ----- \n", " 0 Name 800 non-null object\n", " 1 Type 1 800 non-null object\n", "dtypes: object(2)\n", "memory usage: 87.8 KB\n" ] } ], "source": [ "small_data.info(memory_usage='deep')" ] }, { "cell_type": "markdown", "id": "15bc55c4", "metadata": {}, "source": [ "If we know a column has only a few unique values, we can load it as `category` type to save memory:" ] }, { "cell_type": "code", "execution_count": 28, "id": "3a64d039", "metadata": { "execution": { "iopub.execute_input": "2025-09-02T16:31:20.214064Z", "iopub.status.busy": "2025-09-02T16:31:20.213941Z", "iopub.status.idle": "2025-09-02T16:31:20.221774Z", "shell.execute_reply": "2025-09-02T16:31:20.221272Z" }, "tags": [ "hide-output", "scroll-output" ] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "RangeIndex: 800 entries, 0 to 799\n", "Data columns (total 2 columns):\n", " # Column Non-Null Count Dtype \n", "--- ------ -------------- ----- \n", " 0 Name 800 
non-null object \n", " 1 Type 1 800 non-null category\n", "dtypes: category(1), object(1)\n", "memory usage: 47.7 KB\n" ] } ], "source": [ "smaller_data = pd.read_csv('data/Pokemon.csv', usecols=['Name', 'Type 1'], dtype={'Type 1': 'category'})\n", "smaller_data.info(memory_usage='deep')" ] }, { "cell_type": "markdown", "id": "b0e4d360", "metadata": {}, "source": [ "### Load in chunks\n", "`next()` is used here to get the first chunk (a DataFrame with 100 rows) from the chunked CSV reader, so you can call `.head()` on it." ] }, { "cell_type": "code", "execution_count": 29, "id": "0642e955", "metadata": { "execution": { "iopub.execute_input": "2025-09-02T16:31:20.223103Z", "iopub.status.busy": "2025-09-02T16:31:20.222983Z", "iopub.status.idle": "2025-09-02T16:31:20.230901Z", "shell.execute_reply": "2025-09-02T16:31:20.230412Z" }, "tags": [ "hide-output", "scroll-output" ] }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
#NameType 1Type 2TotalHPAttackDefenseSp. AtkSp. DefSpeedGenerationLegendary
01BulbasaurGrassPoison3184549496565451False
12IvysaurGrassPoison4056062638080601False
23VenusaurGrassPoison525808283100100801False
33VenusaurMega VenusaurGrassPoison62580100123122120801False
44CharmanderFireNaN3093952436050651False
..........................................
9588GrimerPoisonNaN3258080504050251False
9689MukPoisonNaN5001051057565100501False
9790ShellderWaterNaN30530651004525401False
9891CloysterWaterIce52550951808545701False
9992GastlyGhostPoison31030353010035801False
\n", "

100 rows × 13 columns

\n", "
" ], "text/plain": [ " # Name Type 1 Type 2 Total HP Attack Defense \\\n", "0 1 Bulbasaur Grass Poison 318 45 49 49 \n", "1 2 Ivysaur Grass Poison 405 60 62 63 \n", "2 3 Venusaur Grass Poison 525 80 82 83 \n", "3 3 VenusaurMega Venusaur Grass Poison 625 80 100 123 \n", "4 4 Charmander Fire NaN 309 39 52 43 \n", ".. .. ... ... ... ... ... ... ... \n", "95 88 Grimer Poison NaN 325 80 80 50 \n", "96 89 Muk Poison NaN 500 105 105 75 \n", "97 90 Shellder Water NaN 305 30 65 100 \n", "98 91 Cloyster Water Ice 525 50 95 180 \n", "99 92 Gastly Ghost Poison 310 30 35 30 \n", "\n", " Sp. Atk Sp. Def Speed Generation Legendary \n", "0 65 65 45 1 False \n", "1 80 80 60 1 False \n", "2 100 100 80 1 False \n", "3 122 120 80 1 False \n", "4 60 50 65 1 False \n", ".. ... ... ... ... ... \n", "95 40 50 25 1 False \n", "96 65 100 50 1 False \n", "97 45 25 40 1 False \n", "98 85 45 70 1 False \n", "99 100 35 80 1 False \n", "\n", "[100 rows x 13 columns]" ] }, "execution_count": 29, "metadata": {}, "output_type": "execute_result" } ], "source": [ "chunks_df = pd.read_csv('data/Pokemon.csv', chunksize=100)\n", "next(chunks_df) # first 100 rows" ] }, { "cell_type": "markdown", "id": "5812a0d4", "metadata": {}, "source": [ "We can save each chunk to separate data files for later usage:" ] }, { "cell_type": "code", "execution_count": 30, "id": "eb3ccf58", "metadata": { "execution": { "iopub.execute_input": "2025-09-02T16:31:20.232368Z", "iopub.status.busy": "2025-09-02T16:31:20.232237Z", "iopub.status.idle": "2025-09-02T16:31:20.245405Z", "shell.execute_reply": "2025-09-02T16:31:20.244941Z" }, "tags": [ "hide-output", "scroll-output" ] }, "outputs": [], "source": [ "for i, df in enumerate(pd.read_csv('data/Pokemon.csv', chunksize=100)):\n", " df.to_csv(f'data/Pokemon_{i}.csv', index=False)" ] }, { "cell_type": "code", "execution_count": 31, "id": "3bee9440", "metadata": { "execution": { "iopub.execute_input": "2025-09-02T16:31:20.246821Z", "iopub.status.busy": 
"2025-09-02T16:31:20.246691Z", "iopub.status.idle": "2025-09-02T16:31:20.253218Z", "shell.execute_reply": "2025-09-02T16:31:20.252712Z" }, "tags": [ "hide-output", "scroll-output" ] }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
#NameType 1Type 2TotalHPAttackDefenseSp. AtkSp. DefSpeedGenerationLegendary
01BulbasaurGrassPoison3184549496565451False
12IvysaurGrassPoison4056062638080601False
23VenusaurGrassPoison525808283100100801False
33VenusaurMega VenusaurGrassPoison62580100123122120801False
44CharmanderFireNaN3093952436050651False
\n", "
" ], "text/plain": [ " # Name Type 1 Type 2 Total HP Attack Defense \\\n", "0 1 Bulbasaur Grass Poison 318 45 49 49 \n", "1 2 Ivysaur Grass Poison 405 60 62 63 \n", "2 3 Venusaur Grass Poison 525 80 82 83 \n", "3 3 VenusaurMega Venusaur Grass Poison 625 80 100 123 \n", "4 4 Charmander Fire NaN 309 39 52 43 \n", "\n", " Sp. Atk Sp. Def Speed Generation Legendary \n", "0 65 65 45 1 False \n", "1 80 80 60 1 False \n", "2 100 100 80 1 False \n", "3 122 120 80 1 False \n", "4 60 50 65 1 False " ] }, "execution_count": 31, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df0 = pd.read_csv('data/Pokemon_0.csv')\n", "df0.head()" ] }, { "cell_type": "code", "execution_count": 32, "id": "19f5c79f", "metadata": { "execution": { "iopub.execute_input": "2025-09-02T16:31:20.254407Z", "iopub.status.busy": "2025-09-02T16:31:20.254284Z", "iopub.status.idle": "2025-09-02T16:31:20.260914Z", "shell.execute_reply": "2025-09-02T16:31:20.260436Z" }, "tags": [ "hide-output", "scroll-output" ] }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
#NameType 1Type 2TotalHPAttackDefenseSp. AtkSp. DefSpeedGenerationLegendary
093HaunterGhostPoison40545504511555951False
194GengarGhostPoison500606560130751101False
294GengarMega GengarGhostPoison600606580170951301False
395OnixRockGround38535451603045701False
496DrowzeePsychicNaN3286048454390421False
\n", "
" ], "text/plain": [ " # Name Type 1 Type 2 Total HP Attack Defense \\\n", "0 93 Haunter Ghost Poison 405 45 50 45 \n", "1 94 Gengar Ghost Poison 500 60 65 60 \n", "2 94 GengarMega Gengar Ghost Poison 600 60 65 80 \n", "3 95 Onix Rock Ground 385 35 45 160 \n", "4 96 Drowzee Psychic NaN 328 60 48 45 \n", "\n", " Sp. Atk Sp. Def Speed Generation Legendary \n", "0 115 55 95 1 False \n", "1 130 75 110 1 False \n", "2 170 95 130 1 False \n", "3 30 45 70 1 False \n", "4 43 90 42 1 False " ] }, "execution_count": 32, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df1 = pd.read_csv('data/Pokemon_1.csv')\n", "df1.head()" ] }, { "cell_type": "markdown", "id": "500bacd2", "metadata": {}, "source": [ "But how do we combine the data back? See [below](#build-a-dataframe-from-multiple-files-row-wise)" ] }, { "cell_type": "markdown", "id": "79a491b1", "metadata": {}, "source": [ "## Build a DataFrame from multiple files (row-wise)" ] }, { "cell_type": "markdown", "id": "3f73b02b", "metadata": {}, "source": [ "Suppose we want to join `Pokemon_0.csv` and `Pokemon_1.csv` row-wise:" ] }, { "cell_type": "markdown", "id": "51dac7f3", "metadata": {}, "source": [ "- Use `pd.concat` and pass in the list of dataframes we want to join" ] }, { "cell_type": "code", "execution_count": 33, "id": "b87db2ce", "metadata": { "execution": { "iopub.execute_input": "2025-09-02T16:31:20.262255Z", "iopub.status.busy": "2025-09-02T16:31:20.262133Z", "iopub.status.idle": "2025-09-02T16:31:20.269945Z", "shell.execute_reply": "2025-09-02T16:31:20.269487Z" }, "tags": [ "hide-output", "scroll-output" ] }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
#NameType 1Type 2TotalHPAttackDefenseSp. AtkSp. DefSpeedGenerationLegendary
01BulbasaurGrassPoison3184549496565451False
12IvysaurGrassPoison4056062638080601False
23VenusaurGrassPoison525808283100100801False
33VenusaurMega VenusaurGrassPoison62580100123122120801False
44CharmanderFireNaN3093952436050651False
\n", "
" ], "text/plain": [ " # Name Type 1 Type 2 Total HP Attack Defense \\\n", "0 1 Bulbasaur Grass Poison 318 45 49 49 \n", "1 2 Ivysaur Grass Poison 405 60 62 63 \n", "2 3 Venusaur Grass Poison 525 80 82 83 \n", "3 3 VenusaurMega Venusaur Grass Poison 625 80 100 123 \n", "4 4 Charmander Fire NaN 309 39 52 43 \n", "\n", " Sp. Atk Sp. Def Speed Generation Legendary \n", "0 65 65 45 1 False \n", "1 80 80 60 1 False \n", "2 100 100 80 1 False \n", "3 122 120 80 1 False \n", "4 60 50 65 1 False " ] }, "execution_count": 33, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df0 = pd.read_csv('data/Pokemon_0.csv')\n", "df1 = pd.read_csv('data/Pokemon_1.csv')\n", "df = pd.concat([df0, df1])\n", "df.head()" ] }, { "cell_type": "markdown", "id": "ac521f54", "metadata": {}, "source": [ "But here, we'll see some unexpected indices, as they are not consecutive (reset at the first row of 2nd dataframe):" ] }, { "cell_type": "code", "execution_count": 34, "id": "1d7f30b7", "metadata": { "execution": { "iopub.execute_input": "2025-09-02T16:31:20.271307Z", "iopub.status.busy": "2025-09-02T16:31:20.271185Z", "iopub.status.idle": "2025-09-02T16:31:20.277157Z", "shell.execute_reply": "2025-09-02T16:31:20.276676Z" }, "tags": [ "hide-output", "scroll-output" ] }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
#NameType 1Type 2TotalHPAttackDefenseSp. AtkSp. DefSpeedGenerationLegendary
9588GrimerPoisonNaN3258080504050251False
9689MukPoisonNaN5001051057565100501False
9790ShellderWaterNaN30530651004525401False
9891CloysterWaterIce52550951808545701False
9992GastlyGhostPoison31030353010035801False
093HaunterGhostPoison40545504511555951False
194GengarGhostPoison500606560130751101False
294GengarMega GengarGhostPoison600606580170951301False
395OnixRockGround38535451603045701False
496DrowzeePsychicNaN3286048454390421False
\n", "
" ], "text/plain": [ " # Name Type 1 Type 2 Total HP Attack Defense \\\n", "95 88 Grimer Poison NaN 325 80 80 50 \n", "96 89 Muk Poison NaN 500 105 105 75 \n", "97 90 Shellder Water NaN 305 30 65 100 \n", "98 91 Cloyster Water Ice 525 50 95 180 \n", "99 92 Gastly Ghost Poison 310 30 35 30 \n", "0 93 Haunter Ghost Poison 405 45 50 45 \n", "1 94 Gengar Ghost Poison 500 60 65 60 \n", "2 94 GengarMega Gengar Ghost Poison 600 60 65 80 \n", "3 95 Onix Rock Ground 385 35 45 160 \n", "4 96 Drowzee Psychic NaN 328 60 48 45 \n", "\n", " Sp. Atk Sp. Def Speed Generation Legendary \n", "95 40 50 25 1 False \n", "96 65 100 50 1 False \n", "97 45 25 40 1 False \n", "98 85 45 70 1 False \n", "99 100 35 80 1 False \n", "0 115 55 95 1 False \n", "1 130 75 110 1 False \n", "2 170 95 130 1 False \n", "3 30 45 70 1 False \n", "4 43 90 42 1 False " ] }, "execution_count": 34, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.iloc[95:105]" ] }, { "cell_type": "markdown", "id": "02b4a021", "metadata": {}, "source": [ "We can pass `ignore_index=True` to work around this:" ] }, { "cell_type": "code", "execution_count": 35, "id": "7d11afc7", "metadata": { "execution": { "iopub.execute_input": "2025-09-02T16:31:20.278514Z", "iopub.status.busy": "2025-09-02T16:31:20.278378Z", "iopub.status.idle": "2025-09-02T16:31:20.285435Z", "shell.execute_reply": "2025-09-02T16:31:20.284964Z" }, "tags": [ "hide-output", "scroll-output" ] }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
#NameType 1Type 2TotalHPAttackDefenseSp. AtkSp. DefSpeedGenerationLegendary
9588GrimerPoisonNaN3258080504050251False
9689MukPoisonNaN5001051057565100501False
9790ShellderWaterNaN30530651004525401False
9891CloysterWaterIce52550951808545701False
9992GastlyGhostPoison31030353010035801False
10093HaunterGhostPoison40545504511555951False
10194GengarGhostPoison500606560130751101False
10294GengarMega GengarGhostPoison600606580170951301False
10395OnixRockGround38535451603045701False
10496DrowzeePsychicNaN3286048454390421False
\n", "
" ], "text/plain": [ " # Name Type 1 Type 2 Total HP Attack Defense \\\n", "95 88 Grimer Poison NaN 325 80 80 50 \n", "96 89 Muk Poison NaN 500 105 105 75 \n", "97 90 Shellder Water NaN 305 30 65 100 \n", "98 91 Cloyster Water Ice 525 50 95 180 \n", "99 92 Gastly Ghost Poison 310 30 35 30 \n", "100 93 Haunter Ghost Poison 405 45 50 45 \n", "101 94 Gengar Ghost Poison 500 60 65 60 \n", "102 94 GengarMega Gengar Ghost Poison 600 60 65 80 \n", "103 95 Onix Rock Ground 385 35 45 160 \n", "104 96 Drowzee Psychic NaN 328 60 48 45 \n", "\n", " Sp. Atk Sp. Def Speed Generation Legendary \n", "95 40 50 25 1 False \n", "96 65 100 50 1 False \n", "97 45 25 40 1 False \n", "98 85 45 70 1 False \n", "99 100 35 80 1 False \n", "100 115 55 95 1 False \n", "101 130 75 110 1 False \n", "102 170 95 130 1 False \n", "103 30 45 70 1 False \n", "104 43 90 42 1 False " ] }, "execution_count": 35, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df = pd.concat([df0, df1], ignore_index=True)\n", "df.iloc[95:105]" ] }, { "cell_type": "markdown", "id": "08ffbaab", "metadata": {}, "source": [ "## Build a DataFrame from multiple files (columns-wise)" ] }, { "cell_type": "code", "execution_count": 36, "id": "e7baae37", "metadata": { "execution": { "iopub.execute_input": "2025-09-02T16:31:20.286572Z", "iopub.status.busy": "2025-09-02T16:31:20.286460Z", "iopub.status.idle": "2025-09-02T16:31:20.292810Z", "shell.execute_reply": "2025-09-02T16:31:20.292338Z" }, "tags": [ "hide-output", "scroll-output" ] }, "outputs": [], "source": [ "data.to_csv('data/Pokemon_first_6_cols.csv', columns=data.columns[:6], index=False)\n", "data.to_csv('data/Pokemon_second_7_cols.csv', columns=data.columns[6:], index=False)" ] }, { "cell_type": "code", "execution_count": 37, "id": "5df5b4ec", "metadata": { "execution": { "iopub.execute_input": "2025-09-02T16:31:20.294044Z", "iopub.status.busy": "2025-09-02T16:31:20.293918Z", "iopub.status.idle": "2025-09-02T16:31:20.300575Z", "shell.execute_reply": 
"2025-09-02T16:31:20.300183Z" }, "tags": [ "hide-output", "scroll-output" ] }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
#NameType 1Type 2TotalHP
01BulbasaurGrassPoison31845
12IvysaurGrassPoison40560
23VenusaurGrassPoison52580
33VenusaurMega VenusaurGrassPoison62580
44CharmanderFireNaN30939
\n", "
" ], "text/plain": [ " # Name Type 1 Type 2 Total HP\n", "0 1 Bulbasaur Grass Poison 318 45\n", "1 2 Ivysaur Grass Poison 405 60\n", "2 3 Venusaur Grass Poison 525 80\n", "3 3 VenusaurMega Venusaur Grass Poison 625 80\n", "4 4 Charmander Fire NaN 309 39" ] }, "execution_count": 37, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df0 = pd.read_csv('data/Pokemon_first_6_cols.csv')\n", "df0.head()" ] }, { "cell_type": "code", "execution_count": 38, "id": "58b1307d", "metadata": { "execution": { "iopub.execute_input": "2025-09-02T16:31:20.301891Z", "iopub.status.busy": "2025-09-02T16:31:20.301769Z", "iopub.status.idle": "2025-09-02T16:31:20.307388Z", "shell.execute_reply": "2025-09-02T16:31:20.306971Z" }, "tags": [ "hide-output", "scroll-output" ] }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
AttackDefenseSp. AtkSp. DefSpeedGenerationLegendary
049496565451False
162638080601False
28283100100801False
3100123122120801False
452436050651False
\n", "
" ], "text/plain": [ " Attack Defense Sp. Atk Sp. Def Speed Generation Legendary\n", "0 49 49 65 65 45 1 False\n", "1 62 63 80 80 60 1 False\n", "2 82 83 100 100 80 1 False\n", "3 100 123 122 120 80 1 False\n", "4 52 43 60 50 65 1 False" ] }, "execution_count": 38, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df1 = pd.read_csv('data/Pokemon_second_7_cols.csv')\n", "df1.head()" ] }, { "cell_type": "markdown", "id": "ac97d1ee", "metadata": {}, "source": [ "And, combine them column-wise by using `axis='columns'`:" ] }, { "cell_type": "code", "execution_count": 39, "id": "24a6226d", "metadata": { "execution": { "iopub.execute_input": "2025-09-02T16:31:20.308594Z", "iopub.status.busy": "2025-09-02T16:31:20.308462Z", "iopub.status.idle": "2025-09-02T16:31:20.314348Z", "shell.execute_reply": "2025-09-02T16:31:20.313854Z" }, "tags": [ "hide-output", "scroll-output" ] }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
#NameType 1Type 2TotalHPAttackDefenseSp. AtkSp. DefSpeedGenerationLegendary
01BulbasaurGrassPoison3184549496565451False
12IvysaurGrassPoison4056062638080601False
23VenusaurGrassPoison525808283100100801False
33VenusaurMega VenusaurGrassPoison62580100123122120801False
44CharmanderFireNaN3093952436050651False
\n", "
" ], "text/plain": [ " # Name Type 1 Type 2 Total HP Attack Defense \\\n", "0 1 Bulbasaur Grass Poison 318 45 49 49 \n", "1 2 Ivysaur Grass Poison 405 60 62 63 \n", "2 3 Venusaur Grass Poison 525 80 82 83 \n", "3 3 VenusaurMega Venusaur Grass Poison 625 80 100 123 \n", "4 4 Charmander Fire NaN 309 39 52 43 \n", "\n", " Sp. Atk Sp. Def Speed Generation Legendary \n", "0 65 65 45 1 False \n", "1 80 80 60 1 False \n", "2 100 100 80 1 False \n", "3 122 120 80 1 False \n", "4 60 50 65 1 False " ] }, "execution_count": 39, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df = pd.concat([df0, df1], axis='columns')\n", "df.head()" ] }, { "cell_type": "markdown", "id": "6ffe7b0f", "metadata": {}, "source": [ "## Split a DataFrame into 2 random subsets" ] }, { "cell_type": "markdown", "id": "1346712d", "metadata": {}, "source": [ "We sample 75% of our dataframe into `data_1`:" ] }, { "cell_type": "code", "execution_count": 40, "id": "4245486f", "metadata": { "execution": { "iopub.execute_input": "2025-09-02T16:31:20.316102Z", "iopub.status.busy": "2025-09-02T16:31:20.315864Z", "iopub.status.idle": "2025-09-02T16:31:20.322133Z", "shell.execute_reply": "2025-09-02T16:31:20.321405Z" }, "tags": [ "hide-output", "scroll-output" ] }, "outputs": [ { "data": { "text/plain": [ "800" ] }, "execution_count": 40, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data = pd.read_csv('data/Pokemon.csv')\n", "len(data)" ] }, { "cell_type": "code", "execution_count": 41, "id": "be0c0abe", "metadata": { "execution": { "iopub.execute_input": "2025-09-02T16:31:20.323546Z", "iopub.status.busy": "2025-09-02T16:31:20.323419Z", "iopub.status.idle": "2025-09-02T16:31:20.328017Z", "shell.execute_reply": "2025-09-02T16:31:20.327544Z" }, "tags": [ "hide-output", "scroll-output" ] }, "outputs": [ { "data": { "text/plain": [ "array([ 0, 2, 5, 6, 7, 8, 9, 11, 13, 16, 17, 19, 20,\n", " 21, 22, 23, 24, 25, 27, 28, 29, 31, 33, 34, 35, 36,\n", " 37, 38, 39, 40, 42, 43, 44, 47, 
49, 50, 51, 52, 54,\n", " 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67,\n", " 69, 70, 71, 72, 73, 74, 77, 78, 79, 80, 83, 85, 88,\n", " 89, 90, 91, 92, 93, 94, 95, 96, 97, 99, 100, 101, 102,\n", " 104, 105, 106, 108, 109, 110, 111, 112, 113, 114, 115, 118, 121,\n", " 122, 123, 124, 125, 126, 127, 128, 129, 131, 132, 133, 134, 137,\n", " 138, 140, 141, 143, 144, 145, 146, 147, 148, 149, 151, 152, 153,\n", " 155, 156, 157, 159, 160, 161, 162, 163, 166, 167, 168, 169, 170,\n", " 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 184,\n", " 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 198, 199, 200,\n", " 201, 202, 203, 205, 206, 207, 213, 214, 215, 216, 217, 218, 219,\n", " 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231, 232,\n", " 234, 237, 238, 239, 240, 241, 242, 245, 247, 248, 249, 251, 252,\n", " 253, 254, 255, 260, 261, 262, 263, 264, 265, 266, 267, 268, 269,\n", " 270, 271, 272, 273, 274, 276, 277, 278, 280, 281, 282, 285, 287,\n", " 288, 289, 290, 296, 297, 298, 299, 300, 301, 302, 304, 305, 306,\n", " 307, 308, 309, 310, 315, 319, 320, 321, 322, 323, 324, 326, 327,\n", " 328, 329, 330, 331, 333, 335, 337, 338, 339, 341, 342, 344, 347,\n", " 348, 349, 351, 352, 353, 355, 356, 357, 358, 359, 360, 361, 362,\n", " 363, 364, 366, 367, 368, 369, 370, 373, 375, 376, 378, 380, 381,\n", " 382, 384, 385, 386, 387, 388, 389, 390, 391, 392, 393, 394, 395,\n", " 397, 398, 399, 401, 402, 403, 405, 406, 407, 409, 410, 411, 412,\n", " 413, 414, 415, 416, 418, 419, 420, 421, 422, 423, 424, 426, 427,\n", " 428, 430, 432, 433, 435, 437, 438, 441, 443, 444, 445, 446, 449,\n", " 450, 451, 452, 453, 454, 456, 457, 458, 461, 462, 463, 464, 466,\n", " 467, 468, 470, 472, 473, 474, 475, 477, 478, 479, 480, 481, 483,\n", " 484, 485, 486, 488, 489, 491, 492, 494, 495, 496, 498, 499, 500,\n", " 501, 502, 503, 504, 505, 507, 508, 511, 512, 513, 515, 516, 517,\n", " 518, 519, 521, 522, 523, 524, 525, 526, 529, 530, 532, 533, 534,\n", " 535, 536, 537, 538, 
539, 541, 542, 543, 544, 545, 548, 549, 552,\n", " 554, 555, 556, 557, 558, 560, 563, 564, 565, 566, 567, 569, 570,\n", " 571, 572, 575, 576, 578, 579, 581, 582, 583, 584, 585, 586, 587,\n", " 588, 589, 590, 592, 594, 598, 599, 602, 603, 604, 605, 606, 607,\n", " 609, 611, 612, 613, 614, 615, 616, 618, 620, 621, 622, 623, 624,\n", " 627, 628, 629, 630, 631, 632, 634, 635, 636, 637, 639, 641, 642,\n", " 643, 644, 645, 646, 647, 648, 649, 650, 651, 652, 653, 656, 657,\n", " 658, 659, 660, 662, 665, 666, 667, 668, 669, 670, 672, 673, 674,\n", " 675, 676, 677, 678, 679, 680, 681, 682, 684, 688, 690, 691, 692,\n", " 693, 694, 696, 697, 698, 699, 700, 701, 703, 704, 705, 708, 709,\n", " 710, 711, 712, 713, 714, 715, 716, 718, 719, 720, 721, 722, 724,\n", " 725, 726, 727, 728, 729, 732, 733, 734, 735, 737, 738, 741, 742,\n", " 744, 746, 747, 749, 750, 753, 754, 755, 758, 759, 760, 761, 762,\n", " 764, 766, 767, 769, 770, 771, 772, 773, 774, 776, 777, 778, 780,\n", " 781, 782, 783, 785, 786, 787, 788, 789, 791, 793, 794, 796, 797,\n", " 798, 799])" ] }, "execution_count": 41, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data_1 = data.sample(frac=0.75, random_state=1234)\n", "np.sort(data_1.index)" ] }, { "cell_type": "markdown", "id": "b318baf4", "metadata": {}, "source": [ "Get `data_2` by simply dropping the rows in `data_1.index`:" ] }, { "cell_type": "code", "execution_count": 42, "id": "f58ee268", "metadata": { "execution": { "iopub.execute_input": "2025-09-02T16:31:20.329574Z", "iopub.status.busy": "2025-09-02T16:31:20.329334Z", "iopub.status.idle": "2025-09-02T16:31:20.333708Z", "shell.execute_reply": "2025-09-02T16:31:20.333232Z" }, "tags": [ "hide-output", "scroll-output" ] }, "outputs": [ { "data": { "text/plain": [ "(200,\n", " array([ 1, 3, 4, 10, 12, 14, 15, 18, 26, 30, 32, 41, 45,\n", " 46, 48, 53, 68, 75, 76, 81, 82, 84, 86, 87, 98, 103,\n", " 107, 116, 117, 119, 120, 130, 135, 136, 139, 142, 150, 154, 158,\n", " 164, 165, 183, 195, 196, 197, 204, 208,
209, 210, 211, 212, 233,\n", " 235, 236, 243, 244, 246, 250, 256, 257, 258, 259, 275, 279, 283,\n", " 284, 286, 291, 292, 293, 294, 295, 303, 311, 312, 313, 314, 316,\n", " 317, 318, 325, 332, 334, 336, 340, 343, 345, 346, 350, 354, 365,\n", " 371, 372, 374, 377, 379, 383, 396, 400, 404, 408, 417, 425, 429,\n", " 431, 434, 436, 439, 440, 442, 447, 448, 455, 459, 460, 465, 469,\n", " 471, 476, 482, 487, 490, 493, 497, 506, 509, 510, 514, 520, 527,\n", " 528, 531, 540, 546, 547, 550, 551, 553, 559, 561, 562, 568, 573,\n", " 574, 577, 580, 591, 593, 595, 596, 597, 600, 601, 608, 610, 617,\n", " 619, 625, 626, 633, 638, 640, 654, 655, 661, 663, 664, 671, 683,\n", " 685, 686, 687, 689, 695, 702, 706, 707, 717, 723, 730, 731, 736,\n", " 739, 740, 743, 745, 748, 751, 752, 756, 757, 763, 765, 768, 775,\n", " 779, 784, 790, 792, 795]))" ] }, "execution_count": 42, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data_2 = data.drop(data_1.index)\n", "len(data_2), np.sort(data_2.index)" ] }, { "cell_type": "markdown", "id": "abbc06eb", "metadata": {}, "source": [ "Quick check that the two subsets together cover every row (the lengths should sum to 800):" ] }, { "cell_type": "code", "execution_count": 43, "id": "96575bfe", "metadata": { "execution": { "iopub.execute_input": "2025-09-02T16:31:20.335173Z", "iopub.status.busy": "2025-09-02T16:31:20.335040Z", "iopub.status.idle": "2025-09-02T16:31:20.338539Z", "shell.execute_reply": "2025-09-02T16:31:20.338072Z" }, "tags": [ "hide-output", "scroll-output" ] }, "outputs": [ { "data": { "text/plain": [ "800" ] }, "execution_count": 43, "metadata": {}, "output_type": "execute_result" } ], "source": [ "len(data_1) + len(data_2)" ] }, { "cell_type": "markdown", "id": "c28a7c1f", "metadata": {}, "source": [ "## Handle missing values" ] }, { "cell_type": "markdown", "id": "3a80731b", "metadata": {}, "source": [ "`DataFrame.isna()` returns a boolean object of the same size:" ] }, { "cell_type": "code", "execution_count": 44, "id": "eafb2311", "metadata": { "execution": {
"iopub.execute_input": "2025-09-02T16:31:20.339756Z", "iopub.status.busy": "2025-09-02T16:31:20.339604Z", "iopub.status.idle": "2025-09-02T16:31:20.346636Z", "shell.execute_reply": "2025-09-02T16:31:20.346085Z" }, "tags": [ "hide-output", "scroll-output" ] }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
#NameType 1Type 2TotalHPAttackDefenseSp. AtkSp. DefSpeedGenerationLegendary
0FalseFalseFalseFalseFalseFalseFalseFalseFalseFalseFalseFalseFalse
1FalseFalseFalseFalseFalseFalseFalseFalseFalseFalseFalseFalseFalse
2FalseFalseFalseFalseFalseFalseFalseFalseFalseFalseFalseFalseFalse
3FalseFalseFalseFalseFalseFalseFalseFalseFalseFalseFalseFalseFalse
4FalseFalseFalseTrueFalseFalseFalseFalseFalseFalseFalseFalseFalse
\n", "
" ], "text/plain": [ " # Name Type 1 Type 2 Total HP Attack Defense Sp. Atk \\\n", "0 False False False False False False False False False \n", "1 False False False False False False False False False \n", "2 False False False False False False False False False \n", "3 False False False False False False False False False \n", "4 False False False True False False False False False \n", "\n", " Sp. Def Speed Generation Legendary \n", "0 False False False False \n", "1 False False False False \n", "2 False False False False \n", "3 False False False False \n", "4 False False False False " ] }, "execution_count": 44, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data.isna().head()" ] }, { "cell_type": "markdown", "id": "20e519bc", "metadata": {}, "source": [ "`isna().sum()` to check total missing values for each column:" ] }, { "cell_type": "code", "execution_count": 45, "id": "cc51601d", "metadata": { "execution": { "iopub.execute_input": "2025-09-02T16:31:20.348035Z", "iopub.status.busy": "2025-09-02T16:31:20.347886Z", "iopub.status.idle": "2025-09-02T16:31:20.352794Z", "shell.execute_reply": "2025-09-02T16:31:20.352351Z" }, "tags": [ "hide-output", "scroll-output" ] }, "outputs": [ { "data": { "text/plain": [ "# 0\n", "Name 0\n", "Type 1 0\n", "Type 2 386\n", "Total 0\n", "HP 0\n", "Attack 0\n", "Defense 0\n", "Sp. Atk 0\n", "Sp. 
Def 0\n", "Speed 0\n", "Generation 0\n", "Legendary 0\n", "dtype: int64" ] }, "execution_count": 45, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data.isna().sum()" ] }, { "cell_type": "markdown", "id": "096d5187", "metadata": {}, "source": [ "`isna().mean()` to check the proportion of missing values:" ] }, { "cell_type": "code", "execution_count": 46, "id": "ac0b3bc9", "metadata": { "execution": { "iopub.execute_input": "2025-09-02T16:31:20.354386Z", "iopub.status.busy": "2025-09-02T16:31:20.354254Z", "iopub.status.idle": "2025-09-02T16:31:20.358853Z", "shell.execute_reply": "2025-09-02T16:31:20.358348Z" }, "tags": [ "hide-output", "scroll-output" ] }, "outputs": [ { "data": { "text/plain": [ "# 0.0000\n", "Name 0.0000\n", "Type 1 0.0000\n", "Type 2 0.4825\n", "Total 0.0000\n", "HP 0.0000\n", "Attack 0.0000\n", "Defense 0.0000\n", "Sp. Atk 0.0000\n", "Sp. Def 0.0000\n", "Speed 0.0000\n", "Generation 0.0000\n", "Legendary 0.0000\n", "dtype: float64" ] }, "execution_count": 46, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data.isna().mean()" ] }, { "cell_type": "markdown", "id": "853f649a", "metadata": {}, "source": [ "We can choose to delete rows/columns that have missing values:" ] }, { "cell_type": "code", "execution_count": 47, "id": "6f1e3b48", "metadata": { "execution": { "iopub.execute_input": "2025-09-02T16:31:20.360356Z", "iopub.status.busy": "2025-09-02T16:31:20.360212Z", "iopub.status.idle": "2025-09-02T16:31:20.366936Z", "shell.execute_reply": "2025-09-02T16:31:20.366407Z" }, "tags": [ "hide-output", "scroll-output" ] }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
#NameType 1Type 2TotalHPAttackDefenseSp. AtkSp. DefSpeedGenerationLegendary
01BulbasaurGrassPoison3184549496565451False
12IvysaurGrassPoison4056062638080601False
23VenusaurGrassPoison525808283100100801False
33VenusaurMega VenusaurGrassPoison62580100123122120801False
66CharizardFireFlying534788478109851001False
\n", "
" ], "text/plain": [ " # Name Type 1 Type 2 Total HP Attack Defense \\\n", "0 1 Bulbasaur Grass Poison 318 45 49 49 \n", "1 2 Ivysaur Grass Poison 405 60 62 63 \n", "2 3 Venusaur Grass Poison 525 80 82 83 \n", "3 3 VenusaurMega Venusaur Grass Poison 625 80 100 123 \n", "6 6 Charizard Fire Flying 534 78 84 78 \n", "\n", " Sp. Atk Sp. Def Speed Generation Legendary \n", "0 65 65 45 1 False \n", "1 80 80 60 1 False \n", "2 100 100 80 1 False \n", "3 122 120 80 1 False \n", "6 109 85 100 1 False " ] }, "execution_count": 47, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data.dropna(axis='index').head() # Drop rows" ] }, { "cell_type": "markdown", "id": "bbd0188a", "metadata": {}, "source": [ "- We can use `thresh=` to keep only the rows/columns that have at least `thresh` non-missing values. Below, `Type 2` (about 52% non-missing) is dropped with a 60% threshold and kept with a 40% threshold:" ] }, { "cell_type": "code", "execution_count": 48, "id": "45b05041", "metadata": { "execution": { "iopub.execute_input": "2025-09-02T16:31:20.368377Z", "iopub.status.busy": "2025-09-02T16:31:20.368246Z", "iopub.status.idle": "2025-09-02T16:31:20.375357Z", "shell.execute_reply": "2025-09-02T16:31:20.374873Z" }, "tags": [ "hide-output", "scroll-output" ] }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
#NameType 1TotalHPAttackDefenseSp. AtkSp. DefSpeedGenerationLegendary
01BulbasaurGrass3184549496565451False
12IvysaurGrass4056062638080601False
23VenusaurGrass525808283100100801False
33VenusaurMega VenusaurGrass62580100123122120801False
44CharmanderFire3093952436050651False
\n", "
" ], "text/plain": [ " # Name Type 1 Total HP Attack Defense Sp. Atk \\\n", "0 1 Bulbasaur Grass 318 45 49 49 65 \n", "1 2 Ivysaur Grass 405 60 62 63 80 \n", "2 3 Venusaur Grass 525 80 82 83 100 \n", "3 3 VenusaurMega Venusaur Grass 625 80 100 123 122 \n", "4 4 Charmander Fire 309 39 52 43 60 \n", "\n", " Sp. Def Speed Generation Legendary \n", "0 65 45 1 False \n", "1 80 60 1 False \n", "2 100 80 1 False \n", "3 120 80 1 False \n", "4 50 65 1 False " ] }, "execution_count": 48, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data.dropna(axis='columns', thresh=len(data)*0.6).head()" ] }, { "cell_type": "code", "execution_count": 49, "id": "d8fa2ff0", "metadata": { "execution": { "iopub.execute_input": "2025-09-02T16:31:20.376668Z", "iopub.status.busy": "2025-09-02T16:31:20.376517Z", "iopub.status.idle": "2025-09-02T16:31:20.383422Z", "shell.execute_reply": "2025-09-02T16:31:20.382897Z" }, "tags": [ "hide-output", "scroll-output" ] }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
#NameType 1Type 2TotalHPAttackDefenseSp. AtkSp. DefSpeedGenerationLegendary
01BulbasaurGrassPoison3184549496565451False
12IvysaurGrassPoison4056062638080601False
23VenusaurGrassPoison525808283100100801False
33VenusaurMega VenusaurGrassPoison62580100123122120801False
44CharmanderFireNaN3093952436050651False
\n", "
" ], "text/plain": [ " # Name Type 1 Type 2 Total HP Attack Defense \\\n", "0 1 Bulbasaur Grass Poison 318 45 49 49 \n", "1 2 Ivysaur Grass Poison 405 60 62 63 \n", "2 3 Venusaur Grass Poison 525 80 82 83 \n", "3 3 VenusaurMega Venusaur Grass Poison 625 80 100 123 \n", "4 4 Charmander Fire NaN 309 39 52 43 \n", "\n", " Sp. Atk Sp. Def Speed Generation Legendary \n", "0 65 65 45 1 False \n", "1 80 80 60 1 False \n", "2 100 100 80 1 False \n", "3 122 120 80 1 False \n", "4 60 50 65 1 False " ] }, "execution_count": 49, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data.dropna(axis='columns', thresh=len(data)*0.4).head()" ] }, { "cell_type": "markdown", "id": "3f5ae986", "metadata": {}, "source": [ "`DataFrame.fillna()` to fill missing values:" ] }, { "cell_type": "code", "execution_count": 50, "id": "a430757d", "metadata": { "execution": { "iopub.execute_input": "2025-09-02T16:31:20.384727Z", "iopub.status.busy": "2025-09-02T16:31:20.384597Z", "iopub.status.idle": "2025-09-02T16:31:20.392291Z", "shell.execute_reply": "2025-09-02T16:31:20.391612Z" }, "tags": [ "hide-output", "scroll-output" ] }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
#NameType 1Type 2TotalHPAttackDefenseSp. AtkSp. DefSpeedGenerationLegendary
01BulbasaurGrassPoison3184549496565451False
12IvysaurGrassPoison4056062638080601False
23VenusaurGrassPoison525808283100100801False
33VenusaurMega VenusaurGrassPoison62580100123122120801False
44CharmanderFirehehehehe3093952436050651False
\n", "
" ], "text/plain": [ " # Name Type 1 Type 2 Total HP Attack Defense \\\n", "0 1 Bulbasaur Grass Poison 318 45 49 49 \n", "1 2 Ivysaur Grass Poison 405 60 62 63 \n", "2 3 Venusaur Grass Poison 525 80 82 83 \n", "3 3 VenusaurMega Venusaur Grass Poison 625 80 100 123 \n", "4 4 Charmander Fire hehehehe 309 39 52 43 \n", "\n", " Sp. Atk Sp. Def Speed Generation Legendary \n", "0 65 65 45 1 False \n", "1 80 80 60 1 False \n", "2 100 100 80 1 False \n", "3 122 120 80 1 False \n", "4 60 50 65 1 False " ] }, "execution_count": 50, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data.fillna('hehehehe').head()" ] }, { "cell_type": "markdown", "id": "1fc9366b", "metadata": {}, "source": [ "Or use `DataFrame.interpolate()` to fill missing values based on the surrounding values. Note that in the output below the text column `Type 2` still shows `NaN`, and pandas warns that interpolating object-dtype columns is deprecated:" ] }, { "cell_type": "code", "execution_count": 51, "id": "14205c12", "metadata": { "execution": { "iopub.execute_input": "2025-09-02T16:31:20.393710Z", "iopub.status.busy": "2025-09-02T16:31:20.393566Z", "iopub.status.idle": "2025-09-02T16:31:20.400748Z", "shell.execute_reply": "2025-09-02T16:31:20.400306Z" }, "tags": [ "hide-output", "scroll-output" ] }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/tmp/ipykernel_11297/3876796089.py:1: FutureWarning: DataFrame.interpolate with object dtype is deprecated and will raise in a future version. Call obj.infer_objects(copy=False) before interpolating instead.\n", " data.infer_objects(copy=False).interpolate().head()\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
#NameType 1Type 2TotalHPAttackDefenseSp. AtkSp. DefSpeedGenerationLegendary
01BulbasaurGrassPoison3184549496565451False
12IvysaurGrassPoison4056062638080601False
23VenusaurGrassPoison525808283100100801False
33VenusaurMega VenusaurGrassPoison62580100123122120801False
44CharmanderFireNaN3093952436050651False
\n", "
" ], "text/plain": [ " # Name Type 1 Type 2 Total HP Attack Defense \\\n", "0 1 Bulbasaur Grass Poison 318 45 49 49 \n", "1 2 Ivysaur Grass Poison 405 60 62 63 \n", "2 3 Venusaur Grass Poison 525 80 82 83 \n", "3 3 VenusaurMega Venusaur Grass Poison 625 80 100 123 \n", "4 4 Charmander Fire NaN 309 39 52 43 \n", "\n", " Sp. Atk Sp. Def Speed Generation Legendary \n", "0 65 65 45 1 False \n", "1 80 80 60 1 False \n", "2 100 100 80 1 False \n", "3 122 120 80 1 False \n", "4 60 50 65 1 False " ] }, "execution_count": 51, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data.infer_objects(copy=False).interpolate().head()" ] }, { "cell_type": "markdown", "id": "86c0a115", "metadata": {}, "source": [ "## Split a string into multiple columns" ] }, { "cell_type": "markdown", "id": "3429394a", "metadata": {}, "source": [ "This is useful when, for example, a single `name` column should be split into separate `first`, `middle`, and `last` columns:" ] }, { "cell_type": "code", "execution_count": 52, "id": "5f48264b", "metadata": { "execution": { "iopub.execute_input": "2025-09-02T16:31:20.402105Z", "iopub.status.busy": "2025-09-02T16:31:20.401983Z", "iopub.status.idle": "2025-09-02T16:31:20.407414Z", "shell.execute_reply": "2025-09-02T16:31:20.406962Z" }, "tags": [ "hide-output", "scroll-output" ] }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
namelocation
0John Arthur DoeLos Angeles, CA
1Jane Ann SmithWashington, DC
\n", "
" ], "text/plain": [ " name location\n", "0 John Arthur Doe Los Angeles, CA\n", "1 Jane Ann Smith Washington, DC" ] }, "execution_count": 52, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df = pd.DataFrame(dict(name=['John Arthur Doe', 'Jane Ann Smith'], location=['Los Angeles, CA', 'Washington, DC']))\n", "df" ] }, { "cell_type": "code", "execution_count": 53, "id": "a93de9ea", "metadata": { "execution": { "iopub.execute_input": "2025-09-02T16:31:20.408636Z", "iopub.status.busy": "2025-09-02T16:31:20.408510Z", "iopub.status.idle": "2025-09-02T16:31:20.415895Z", "shell.execute_reply": "2025-09-02T16:31:20.415477Z" }, "tags": [ "hide-output", "scroll-output" ] }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
namelocationfirstmiddlelast
0John Arthur DoeLos Angeles, CAJohnArthurDoe
1Jane Ann SmithWashington, DCJaneAnnSmith
\n", "
" ], "text/plain": [ " name location first middle last\n", "0 John Arthur Doe Los Angeles, CA John Arthur Doe\n", "1 Jane Ann Smith Washington, DC Jane Ann Smith" ] }, "execution_count": 53, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df[['first','middle','last']] = df.name.str.split(expand=True) # if not specify delimiters, split based on whitespace\n", "df" ] }, { "cell_type": "markdown", "id": "3209b1ad", "metadata": {}, "source": [ "## Reshape a MultiIndexed Series" ] }, { "cell_type": "code", "execution_count": 54, "id": "13960761", "metadata": { "execution": { "iopub.execute_input": "2025-09-02T16:31:20.417235Z", "iopub.status.busy": "2025-09-02T16:31:20.417097Z", "iopub.status.idle": "2025-09-02T16:31:20.425741Z", "shell.execute_reply": "2025-09-02T16:31:20.425241Z" }, "tags": [ "hide-output", "scroll-output" ] }, "outputs": [ { "data": { "text/plain": [ "Type 1 Legendary\n", "Bug False 69\n", "Dark False 29\n", " True 2\n", "Dragon False 20\n", " True 12\n", "Electric False 40\n", " True 4\n", "Fairy False 16\n", " True 1\n", "Fighting False 27\n", "Fire False 47\n", " True 5\n", "Flying False 2\n", " True 2\n", "Ghost False 30\n", " True 2\n", "Grass False 67\n", " True 3\n", "Ground False 28\n", " True 4\n", "Ice False 22\n", " True 2\n", "Normal False 96\n", " True 2\n", "Poison False 28\n", "Psychic False 43\n", " True 14\n", "Rock False 40\n", " True 4\n", "Steel False 23\n", " True 4\n", "Water False 108\n", " True 4\n", "Name: #, dtype: int64" ] }, "execution_count": 54, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data.groupby(['Type 1', 'Legendary'])['#'].count()" ] }, { "cell_type": "markdown", "id": "10a90bca", "metadata": {}, "source": [ "What if we want to convert the above Series into a DataFrame, with `Type 1` as rows, `Legendary` as columns, and the counts as values?\n", "- Use `unstack()` after `groupby()` and `count()`:" ] }, { "cell_type": "code", "execution_count": 55, "id": "b6f4338c", 
"metadata": { "execution": { "iopub.execute_input": "2025-09-02T16:31:20.427465Z", "iopub.status.busy": "2025-09-02T16:31:20.427292Z", "iopub.status.idle": "2025-09-02T16:31:20.435846Z", "shell.execute_reply": "2025-09-02T16:31:20.435395Z" }, "tags": [ "hide-output", "scroll-output" ] }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
LegendaryFalseTrue
Type 1
Bug69.0NaN
Dark29.02.0
Dragon20.012.0
Electric40.04.0
Fairy16.01.0
Fighting27.0NaN
Fire47.05.0
Flying2.02.0
Ghost30.02.0
Grass67.03.0
Ground28.04.0
Ice22.02.0
Normal96.02.0
Poison28.0NaN
Psychic43.014.0
Rock40.04.0
Steel23.04.0
Water108.04.0
\n", "
" ], "text/plain": [ "Legendary False True \n", "Type 1 \n", "Bug 69.0 NaN\n", "Dark 29.0 2.0\n", "Dragon 20.0 12.0\n", "Electric 40.0 4.0\n", "Fairy 16.0 1.0\n", "Fighting 27.0 NaN\n", "Fire 47.0 5.0\n", "Flying 2.0 2.0\n", "Ghost 30.0 2.0\n", "Grass 67.0 3.0\n", "Ground 28.0 4.0\n", "Ice 22.0 2.0\n", "Normal 96.0 2.0\n", "Poison 28.0 NaN\n", "Psychic 43.0 14.0\n", "Rock 40.0 4.0\n", "Steel 23.0 4.0\n", "Water 108.0 4.0" ] }, "execution_count": 55, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data.groupby(['Type 1', 'Legendary'])['#'].count().unstack()" ] }, { "cell_type": "markdown", "id": "cfc6bb0d", "metadata": {}, "source": [ "But there's a better way to do this: `pivot_table()` does the grouping, aggregation, and reshaping in a single call, and it supports `margins` (row/column totals), which `unstack()` cannot do. (Missing combinations can be filled with `fill_value=` in both `unstack()` and `pivot_table()`.) See [below](#create-a-pivot-table)." ] }, { "cell_type": "markdown", "id": "5fe89c5f", "metadata": {}, "source": [ "## Create a pivot table" ] }, { "cell_type": "code", "execution_count": 56, "id": "4ba7b772", "metadata": { "execution": { "iopub.execute_input": "2025-09-02T16:31:20.437284Z", "iopub.status.busy": "2025-09-02T16:31:20.437158Z", "iopub.status.idle": "2025-09-02T16:31:20.445443Z", "shell.execute_reply": "2025-09-02T16:31:20.444970Z" }, "tags": [ "hide-output", "scroll-output" ] }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
LegendaryFalseTrue
Type 1
Bug70.971014NaN
Dark86.862069110.500000
Dragon103.400000126.666667
Electric66.12500098.750000
Fairy57.187500131.000000
Fighting96.777778NaN
Fire82.191489109.000000
Flying50.000000107.500000
Ghost71.366667110.000000
Grass72.11940397.666667
Ground88.000000150.000000
Ice73.22727367.500000
Normal72.083333140.000000
Poison74.678571NaN
Psychic54.953488122.142857
Rock89.925000122.250000
Steel92.08695796.250000
Water72.777778111.250000
\n", "
" ], "text/plain": [ "Legendary False True \n", "Type 1 \n", "Bug 70.971014 NaN\n", "Dark 86.862069 110.500000\n", "Dragon 103.400000 126.666667\n", "Electric 66.125000 98.750000\n", "Fairy 57.187500 131.000000\n", "Fighting 96.777778 NaN\n", "Fire 82.191489 109.000000\n", "Flying 50.000000 107.500000\n", "Ghost 71.366667 110.000000\n", "Grass 72.119403 97.666667\n", "Ground 88.000000 150.000000\n", "Ice 73.227273 67.500000\n", "Normal 72.083333 140.000000\n", "Poison 74.678571 NaN\n", "Psychic 54.953488 122.142857\n", "Rock 89.925000 122.250000\n", "Steel 92.086957 96.250000\n", "Water 72.777778 111.250000" ] }, "execution_count": 56, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data.pivot_table(index='Type 1', columns='Legendary', values='Attack', aggfunc='mean')" ] }, { "cell_type": "markdown", "id": "38bf43b7", "metadata": {}, "source": [ "The advantage of pivot table compared to using groupby + unstack is the `margins` parameter, which adds a row/column that gives the totals (or other aggregate functions) for each row/column:" ] }, { "cell_type": "code", "execution_count": 57, "id": "fbce2558", "metadata": { "execution": { "iopub.execute_input": "2025-09-02T16:31:20.446751Z", "iopub.status.busy": "2025-09-02T16:31:20.446627Z", "iopub.status.idle": "2025-09-02T16:31:20.460944Z", "shell.execute_reply": "2025-09-02T16:31:20.460384Z" }, "tags": [ "hide-output", "scroll-output" ] }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
LegendaryFalseTrueAll
Type 1
Bug70.971014NaN70.971014
Dark86.862069110.50000088.387097
Dragon103.400000126.666667112.125000
Electric66.12500098.75000069.090909
Fairy57.187500131.00000061.529412
Fighting96.777778NaN96.777778
Fire82.191489109.00000084.769231
Flying50.000000107.50000078.750000
Ghost71.366667110.00000073.781250
Grass72.11940397.66666773.214286
Ground88.000000150.00000095.750000
Ice73.22727367.50000072.750000
Normal72.083333140.00000073.469388
Poison74.678571NaN74.678571
Psychic54.953488122.14285771.456140
Rock89.925000122.25000092.863636
Steel92.08695796.25000092.703704
Water72.777778111.25000074.151786
All75.669388116.67692379.001250
\n", "
" ], "text/plain": [ "Legendary False True All\n", "Type 1 \n", "Bug 70.971014 NaN 70.971014\n", "Dark 86.862069 110.500000 88.387097\n", "Dragon 103.400000 126.666667 112.125000\n", "Electric 66.125000 98.750000 69.090909\n", "Fairy 57.187500 131.000000 61.529412\n", "Fighting 96.777778 NaN 96.777778\n", "Fire 82.191489 109.000000 84.769231\n", "Flying 50.000000 107.500000 78.750000\n", "Ghost 71.366667 110.000000 73.781250\n", "Grass 72.119403 97.666667 73.214286\n", "Ground 88.000000 150.000000 95.750000\n", "Ice 73.227273 67.500000 72.750000\n", "Normal 72.083333 140.000000 73.469388\n", "Poison 74.678571 NaN 74.678571\n", "Psychic 54.953488 122.142857 71.456140\n", "Rock 89.925000 122.250000 92.863636\n", "Steel 92.086957 96.250000 92.703704\n", "Water 72.777778 111.250000 74.151786\n", "All 75.669388 116.676923 79.001250" ] }, "execution_count": 57, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data.pivot_table(index='Type 1', columns='Legendary', values='Attack', aggfunc='mean', margins=True)" ] }, { "cell_type": "markdown", "id": "30e40fd5", "metadata": {}, "source": [ "## Reshape a DataFrame from wide format to long format" ] }, { "cell_type": "markdown", "id": "f35b6f15", "metadata": {}, "source": [ "What if we want 'HP' and 'Attack' to be the values of the column named 'Stats'?" ] }, { "cell_type": "code", "execution_count": 58, "id": "03988b23", "metadata": { "execution": { "iopub.execute_input": "2025-09-02T16:31:20.462399Z", "iopub.status.busy": "2025-09-02T16:31:20.462267Z", "iopub.status.idle": "2025-09-02T16:31:20.466772Z", "shell.execute_reply": "2025-09-02T16:31:20.466316Z" }, "tags": [ "hide-output", "scroll-output" ] }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
NameHPAttack
0Bulbasaur4549
1Ivysaur6062
2Venusaur8082
3VenusaurMega Venusaur80100
4Charmander3952
\n", "
" ], "text/plain": [ " Name HP Attack\n", "0 Bulbasaur 45 49\n", "1 Ivysaur 60 62\n", "2 Venusaur 80 82\n", "3 VenusaurMega Venusaur 80 100\n", "4 Charmander 39 52" ] }, "execution_count": 58, "metadata": {}, "output_type": "execute_result" } ], "source": [ "f = data[['Name', 'HP', 'Attack']].iloc[:5]\n", "f" ] }, { "cell_type": "code", "execution_count": 59, "id": "6f24c74c", "metadata": { "execution": { "iopub.execute_input": "2025-09-02T16:31:20.468043Z", "iopub.status.busy": "2025-09-02T16:31:20.467911Z", "iopub.status.idle": "2025-09-02T16:31:20.475253Z", "shell.execute_reply": "2025-09-02T16:31:20.474548Z" }, "tags": [ "hide-output", "scroll-output" ] }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
NameStatsPoints
0BulbasaurHP45
1IvysaurHP60
2VenusaurHP80
3VenusaurMega VenusaurHP80
4CharmanderHP39
5BulbasaurAttack49
6IvysaurAttack62
7VenusaurAttack82
8VenusaurMega VenusaurAttack100
9CharmanderAttack52
\n", "
" ], "text/plain": [ " Name Stats Points\n", "0 Bulbasaur HP 45\n", "1 Ivysaur HP 60\n", "2 Venusaur HP 80\n", "3 VenusaurMega Venusaur HP 80\n", "4 Charmander HP 39\n", "5 Bulbasaur Attack 49\n", "6 Ivysaur Attack 62\n", "7 Venusaur Attack 82\n", "8 VenusaurMega Venusaur Attack 100\n", "9 Charmander Attack 52" ] }, "execution_count": 59, "metadata": {}, "output_type": "execute_result" } ], "source": [ "f.melt(id_vars='Name', var_name='Stats', value_name='Points')" ] }, { "cell_type": "markdown", "id": "de15445c", "metadata": {}, "source": [ "## Convert continuous data to categorical data" ] }, { "cell_type": "markdown", "id": "2fdd4030", "metadata": {}, "source": [ "What if we want `Attack` to be categorized (up to 50: 'Weak', 51-100: 'Normal', 101-150: 'Strong', above 150: 'nani?!')?" ] }, { "cell_type": "markdown", "id": "eb5ccf6b", "metadata": {}, "source": [ "Use `pd.cut(x, bins=..., labels=...)` to convert continuous data to categorical data. Here, we convert `Attack` into 4 categories: 'Weak', 'Normal', 'Strong', 'nani?!'. Each bin includes its right edge by default (`right=True`), so an `Attack` of exactly 100 is labelled 'Normal'." ] }, { "cell_type": "code", "execution_count": 60, "id": "59e4c2ee", "metadata": { "execution": { "iopub.execute_input": "2025-09-02T16:31:20.476685Z", "iopub.status.busy": "2025-09-02T16:31:20.476548Z", "iopub.status.idle": "2025-09-02T16:31:20.480672Z", "shell.execute_reply": "2025-09-02T16:31:20.480121Z" }, "tags": [ "hide-output", "scroll-output" ] }, "outputs": [], "source": [ "df = data.copy()\n", "df['Attack'] = pd.cut(df['Attack'], bins=[0, 50, 100, 150, 200], labels=['Weak', 'Normal', 'Strong', 'nani?!'])" ] }, { "cell_type": "code", "execution_count": 61, "id": "797bec9b", "metadata": { "execution": { "iopub.execute_input": "2025-09-02T16:31:20.482116Z", "iopub.status.busy": "2025-09-02T16:31:20.481932Z", "iopub.status.idle": "2025-09-02T16:31:20.490227Z", "shell.execute_reply": "2025-09-02T16:31:20.489720Z" }, "tags": [ "hide-output", "scroll-output" ] }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
#NameType 1Type 2TotalHPAttackDefenseSp. AtkSp. DefSpeedGenerationLegendary
01BulbasaurGrassPoison31845Weak496565451False
12IvysaurGrassPoison40560Normal638080601False
23VenusaurGrassPoison52580Normal83100100801False
33VenusaurMega VenusaurGrassPoison62580Normal123122120801False
44CharmanderFireNaN30939Normal436050651False
..........................................
795719DiancieRockFairy60050Normal150100150506True
796719DiancieMega DiancieRockFairy70050nani?!1101601101106True
797720HoopaHoopa ConfinedPsychicGhost60080Strong60150130706True
798720HoopaHoopa UnboundPsychicDark68080nani?!60170130806True
799721VolcanionFireWater60080Strong12013090706True
\n", "

800 rows × 13 columns

\n", "
" ], "text/plain": [ " # Name Type 1 Type 2 Total HP Attack Defense \\\n", "0 1 Bulbasaur Grass Poison 318 45 Weak 49 \n", "1 2 Ivysaur Grass Poison 405 60 Normal 63 \n", "2 3 Venusaur Grass Poison 525 80 Normal 83 \n", "3 3 VenusaurMega Venusaur Grass Poison 625 80 Normal 123 \n", "4 4 Charmander Fire NaN 309 39 Normal 43 \n", ".. ... ... ... ... ... .. ... ... \n", "795 719 Diancie Rock Fairy 600 50 Normal 150 \n", "796 719 DiancieMega Diancie Rock Fairy 700 50 nani?! 110 \n", "797 720 HoopaHoopa Confined Psychic Ghost 600 80 Strong 60 \n", "798 720 HoopaHoopa Unbound Psychic Dark 680 80 nani?! 60 \n", "799 721 Volcanion Fire Water 600 80 Strong 120 \n", "\n", " Sp. Atk Sp. Def Speed Generation Legendary \n", "0 65 65 45 1 False \n", "1 80 80 60 1 False \n", "2 100 100 80 1 False \n", "3 122 120 80 1 False \n", "4 60 50 65 1 False \n", ".. ... ... ... ... ... \n", "795 100 150 50 6 True \n", "796 160 110 110 6 True \n", "797 150 130 70 6 True \n", "798 170 130 80 6 True \n", "799 130 90 70 6 True \n", "\n", "[800 rows x 13 columns]" ] }, "execution_count": 61, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df" ] }, { "cell_type": "markdown", "id": "07e207f6", "metadata": {}, "source": [ "## Profile a DataFrame" ] }, { "cell_type": "code", "execution_count": 62, "id": "1f5789a6", "metadata": { "execution": { "iopub.execute_input": "2025-09-02T16:31:20.491524Z", "iopub.status.busy": "2025-09-02T16:31:20.491400Z", "iopub.status.idle": "2025-09-02T16:31:30.734680Z", "shell.execute_reply": "2025-09-02T16:31:30.733822Z" }, "tags": [ "hide-output", "scroll-output" ] }, "outputs": [ { "data": { "text/html": [ "\n", "
\n", " Upgrade to ydata-sdk\n", "

\n", " Improve your data and profiling with ydata-sdk, featuring data quality scoring, redundancy detection, outlier identification, text validation, and synthetic data generation.\n", "

\n", "
\n", " " ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "12eb6a9be6a44bcdae38bd7327767e1c", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Summarize dataset: 0%| | 0/5 [00:00" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "from ydata_profiling import ProfileReport\n", "\n", "report = ProfileReport(data)\n", "report.to_notebook_iframe()" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (code-venv)", "language": "python", "name": "code-venv" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.12.3" }, "widgets": { "application/vnd.jupyter.widget-state+json": { "state": { "0330f70d381e45318fc9ae454de84526": { "model_module": "@jupyter-widgets/controls", "model_module_version": "2.0.0", "model_name": "HBoxModel", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "2.0.0", "_model_name": "HBoxModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "2.0.0", "_view_name": "HBoxView", "box_style": "", "children": [ "IPY_MODEL_db48803a216e49629c0f1ccf75a43252", "IPY_MODEL_f4624adb0fde4b2c94a83b9b37763fee", "IPY_MODEL_783f400f12aa40d4b9961cb3d026bb47" ], "layout": "IPY_MODEL_0b648993245e4191b23bb445828650a6", "tabbable": null, "tooltip": null } }, "0b648993245e4191b23bb445828650a6": { "model_module": "@jupyter-widgets/base", "model_module_version": "2.0.0", "model_name": "LayoutModel", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "2.0.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "2.0.0", "_view_name": "LayoutView", "align_content": null, 
"align_items": null, "align_self": null, "border_bottom": null, "border_left": null, "border_right": null, "border_top": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "0fc20d30ac154a53ac1456c6cd9278e3": { "model_module": "@jupyter-widgets/controls", "model_module_version": "2.0.0", "model_name": "HTMLModel", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "2.0.0", "_model_name": "HTMLModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "2.0.0", "_view_name": "HTMLView", "description": "", "description_allow_html": false, "layout": "IPY_MODEL_1240d58141374d82b6daf6683044f68f", "placeholder": "​", "style": "IPY_MODEL_69662b5878a64a719ed7e424e9e9e31b", "tabbable": null, "tooltip": null, "value": "Summarize dataset: 100%" } }, "1240d58141374d82b6daf6683044f68f": { "model_module": "@jupyter-widgets/base", "model_module_version": "2.0.0", "model_name": "LayoutModel", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "2.0.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "2.0.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border_bottom": null, "border_left": null, "border_right": null, "border_top": null, "bottom": null, "display": null, 
"flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "12eb6a9be6a44bcdae38bd7327767e1c": { "model_module": "@jupyter-widgets/controls", "model_module_version": "2.0.0", "model_name": "HBoxModel", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "2.0.0", "_model_name": "HBoxModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "2.0.0", "_view_name": "HBoxView", "box_style": "", "children": [ "IPY_MODEL_0fc20d30ac154a53ac1456c6cd9278e3", "IPY_MODEL_f54e81c0d71d424abe714db77131adc9", "IPY_MODEL_fefb20db8c65484098975b9e20de222e" ], "layout": "IPY_MODEL_e9c4668a3b2c45acb4a3e6c686899e9b", "tabbable": null, "tooltip": null } }, "1e22e8e478df4a12b5b5abe2e2d879d4": { "model_module": "@jupyter-widgets/base", "model_module_version": "2.0.0", "model_name": "LayoutModel", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "2.0.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "2.0.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border_bottom": null, "border_left": null, "border_right": null, "border_top": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, 
"grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "25d37c6bc6954c02a7749fbbd8ceaaf2": { "model_module": "@jupyter-widgets/base", "model_module_version": "2.0.0", "model_name": "LayoutModel", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "2.0.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "2.0.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border_bottom": null, "border_left": null, "border_right": null, "border_top": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "2992c1e92ce043ff9088556a08dc6512": { "model_module": "@jupyter-widgets/controls", "model_module_version": "2.0.0", "model_name": "ProgressStyleModel", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "2.0.0", "_model_name": "ProgressStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": 
"2.0.0", "_view_name": "StyleView", "bar_color": null, "description_width": "" } }, "2a976d94bb914baca9e7a703022c1417": { "model_module": "@jupyter-widgets/controls", "model_module_version": "2.0.0", "model_name": "HTMLStyleModel", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "2.0.0", "_model_name": "HTMLStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "2.0.0", "_view_name": "StyleView", "background": null, "description_width": "", "font_size": null, "text_color": null } }, "2db9367ce39842ff8cce6dc649902d53": { "model_module": "@jupyter-widgets/base", "model_module_version": "2.0.0", "model_name": "LayoutModel", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "2.0.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "2.0.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border_bottom": null, "border_left": null, "border_right": null, "border_top": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "463a4a7f03ea4823a432cb7c3f59cdd2": { "model_module": "@jupyter-widgets/controls", "model_module_version": "2.0.0", "model_name": "HTMLStyleModel", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "2.0.0", "_model_name": 
"HTMLStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "2.0.0", "_view_name": "StyleView", "background": null, "description_width": "", "font_size": null, "text_color": null } }, "4ceaa7ed9b9b4002ae646be2f146d143": { "model_module": "@jupyter-widgets/controls", "model_module_version": "2.0.0", "model_name": "HTMLStyleModel", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "2.0.0", "_model_name": "HTMLStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "2.0.0", "_view_name": "StyleView", "background": null, "description_width": "", "font_size": null, "text_color": null } }, "5990b8c82787493f8037a251d1764d42": { "model_module": "@jupyter-widgets/base", "model_module_version": "2.0.0", "model_name": "LayoutModel", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "2.0.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "2.0.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border_bottom": null, "border_left": null, "border_right": null, "border_top": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "618c25e5c71e4054b5332c23568b721d": { "model_module": "@jupyter-widgets/base", "model_module_version": "2.0.0", 
"model_name": "LayoutModel", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "2.0.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "2.0.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border_bottom": null, "border_left": null, "border_right": null, "border_top": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "689155c2c43c4298b69ca40fd727341f": { "model_module": "@jupyter-widgets/base", "model_module_version": "2.0.0", "model_name": "LayoutModel", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "2.0.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "2.0.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border_bottom": null, "border_left": null, "border_right": null, "border_top": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, 
"max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "69662b5878a64a719ed7e424e9e9e31b": { "model_module": "@jupyter-widgets/controls", "model_module_version": "2.0.0", "model_name": "HTMLStyleModel", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "2.0.0", "_model_name": "HTMLStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "2.0.0", "_view_name": "StyleView", "background": null, "description_width": "", "font_size": null, "text_color": null } }, "6a1f4e19dd884a7cb857111cd0aec6a3": { "model_module": "@jupyter-widgets/controls", "model_module_version": "2.0.0", "model_name": "ProgressStyleModel", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "2.0.0", "_model_name": "ProgressStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "2.0.0", "_view_name": "StyleView", "bar_color": null, "description_width": "" } }, "770703d0984f4e3bac65b6281c14165c": { "model_module": "@jupyter-widgets/controls", "model_module_version": "2.0.0", "model_name": "HBoxModel", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "2.0.0", "_model_name": "HBoxModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "2.0.0", "_view_name": "HBoxView", "box_style": "", "children": [ "IPY_MODEL_d1735190806e4d74a797ceb1123aa03d", "IPY_MODEL_ede132f426e745ac8aebee031dc893f5", "IPY_MODEL_aecd42052dd64b5a9825d625cdfb2e4d" ], "layout": "IPY_MODEL_689155c2c43c4298b69ca40fd727341f", "tabbable": null, "tooltip": null } }, "783f400f12aa40d4b9961cb3d026bb47": { "model_module": "@jupyter-widgets/controls", "model_module_version": "2.0.0", "model_name": "HTMLModel", "state": { 
"_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "2.0.0", "_model_name": "HTMLModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "2.0.0", "_view_name": "HTMLView", "description": "", "description_allow_html": false, "layout": "IPY_MODEL_25d37c6bc6954c02a7749fbbd8ceaaf2", "placeholder": "​", "style": "IPY_MODEL_4ceaa7ed9b9b4002ae646be2f146d143", "tabbable": null, "tooltip": null, "value": " 1/1 [00:01<00:00,  1.47s/it]" } }, "809b74d1efde4147afae3eeb54576a2a": { "model_module": "@jupyter-widgets/controls", "model_module_version": "2.0.0", "model_name": "HTMLStyleModel", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "2.0.0", "_model_name": "HTMLStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "2.0.0", "_view_name": "StyleView", "background": null, "description_width": "", "font_size": null, "text_color": null } }, "aecd42052dd64b5a9825d625cdfb2e4d": { "model_module": "@jupyter-widgets/controls", "model_module_version": "2.0.0", "model_name": "HTMLModel", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "2.0.0", "_model_name": "HTMLModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "2.0.0", "_view_name": "HTMLView", "description": "", "description_allow_html": false, "layout": "IPY_MODEL_d81afc928a504789b8a1f0d52b2cfd67", "placeholder": "​", "style": "IPY_MODEL_c5b844a1756745319b427a5c497bce70", "tabbable": null, "tooltip": null, "value": " 1/1 [00:01<00:00,  1.32s/it]" } }, "ba5a6229f62948f081ff7688a7aebcb4": { "model_module": "@jupyter-widgets/base", "model_module_version": "2.0.0", "model_name": "LayoutModel", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "2.0.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", 
"_view_module_version": "2.0.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border_bottom": null, "border_left": null, "border_right": null, "border_top": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "c5b844a1756745319b427a5c497bce70": { "model_module": "@jupyter-widgets/controls", "model_module_version": "2.0.0", "model_name": "HTMLStyleModel", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "2.0.0", "_model_name": "HTMLStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "2.0.0", "_view_name": "StyleView", "background": null, "description_width": "", "font_size": null, "text_color": null } }, "d1735190806e4d74a797ceb1123aa03d": { "model_module": "@jupyter-widgets/controls", "model_module_version": "2.0.0", "model_name": "HTMLModel", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "2.0.0", "_model_name": "HTMLModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "2.0.0", "_view_name": "HTMLView", "description": "", "description_allow_html": false, "layout": "IPY_MODEL_ba5a6229f62948f081ff7688a7aebcb4", "placeholder": "​", "style": "IPY_MODEL_809b74d1efde4147afae3eeb54576a2a", "tabbable": null, "tooltip": null, "value": "Render HTML: 100%" } }, 
"d81afc928a504789b8a1f0d52b2cfd67": { "model_module": "@jupyter-widgets/base", "model_module_version": "2.0.0", "model_name": "LayoutModel", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "2.0.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "2.0.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border_bottom": null, "border_left": null, "border_right": null, "border_top": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "d9437da0b52e407aaf6d8546f0cdc477": { "model_module": "@jupyter-widgets/base", "model_module_version": "2.0.0", "model_name": "LayoutModel", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "2.0.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "2.0.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border_bottom": null, "border_left": null, "border_right": null, "border_top": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": 
null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "db48803a216e49629c0f1ccf75a43252": { "model_module": "@jupyter-widgets/controls", "model_module_version": "2.0.0", "model_name": "HTMLModel", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "2.0.0", "_model_name": "HTMLModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "2.0.0", "_view_name": "HTMLView", "description": "", "description_allow_html": false, "layout": "IPY_MODEL_1e22e8e478df4a12b5b5abe2e2d879d4", "placeholder": "​", "style": "IPY_MODEL_2a976d94bb914baca9e7a703022c1417", "tabbable": null, "tooltip": null, "value": "Generate report structure: 100%" } }, "e9c4668a3b2c45acb4a3e6c686899e9b": { "model_module": "@jupyter-widgets/base", "model_module_version": "2.0.0", "model_name": "LayoutModel", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "2.0.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "2.0.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border_bottom": null, "border_left": null, "border_right": null, "border_top": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": 
null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "eb30973d90974510a6b1869c8a0cb30b": { "model_module": "@jupyter-widgets/controls", "model_module_version": "2.0.0", "model_name": "ProgressStyleModel", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "2.0.0", "_model_name": "ProgressStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "2.0.0", "_view_name": "StyleView", "bar_color": null, "description_width": "" } }, "ede132f426e745ac8aebee031dc893f5": { "model_module": "@jupyter-widgets/controls", "model_module_version": "2.0.0", "model_name": "FloatProgressModel", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "2.0.0", "_model_name": "FloatProgressModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "2.0.0", "_view_name": "ProgressView", "bar_style": "success", "description": "", "description_allow_html": false, "layout": "IPY_MODEL_618c25e5c71e4054b5332c23568b721d", "max": 1.0, "min": 0.0, "orientation": "horizontal", "style": "IPY_MODEL_2992c1e92ce043ff9088556a08dc6512", "tabbable": null, "tooltip": null, "value": 1.0 } }, "f4624adb0fde4b2c94a83b9b37763fee": { "model_module": "@jupyter-widgets/controls", "model_module_version": "2.0.0", "model_name": "FloatProgressModel", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "2.0.0", "_model_name": "FloatProgressModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "2.0.0", "_view_name": "ProgressView", "bar_style": "success", "description": "", "description_allow_html": false, "layout": "IPY_MODEL_2db9367ce39842ff8cce6dc649902d53", "max": 1.0, "min": 0.0, "orientation": "horizontal", "style": 
"IPY_MODEL_eb30973d90974510a6b1869c8a0cb30b", "tabbable": null, "tooltip": null, "value": 1.0 } }, "f54e81c0d71d424abe714db77131adc9": { "model_module": "@jupyter-widgets/controls", "model_module_version": "2.0.0", "model_name": "FloatProgressModel", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "2.0.0", "_model_name": "FloatProgressModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "2.0.0", "_view_name": "ProgressView", "bar_style": "success", "description": "", "description_allow_html": false, "layout": "IPY_MODEL_d9437da0b52e407aaf6d8546f0cdc477", "max": 5.0, "min": 0.0, "orientation": "horizontal", "style": "IPY_MODEL_6a1f4e19dd884a7cb857111cd0aec6a3", "tabbable": null, "tooltip": null, "value": 5.0 } }, "fefb20db8c65484098975b9e20de222e": { "model_module": "@jupyter-widgets/controls", "model_module_version": "2.0.0", "model_name": "HTMLModel", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "2.0.0", "_model_name": "HTMLModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "2.0.0", "_view_name": "HTMLView", "description": "", "description_allow_html": false, "layout": "IPY_MODEL_5990b8c82787493f8037a251d1764d42", "placeholder": "​", "style": "IPY_MODEL_463a4a7f03ea4823a432cb7c3f59cdd2", "tabbable": null, "tooltip": null, "value": " 103/103 [00:04<00:00, 17.46it/s, Completed]" } } }, "version_major": 2, "version_minor": 0 } } }, "nbformat": 4, "nbformat_minor": 5 }