{ "cells": [ { "cell_type": "markdown", "id": "6e0b53c0", "metadata": {}, "source": [ "# pandas: Deal with Large Datasets" ] }, { "cell_type": "markdown", "id": "6f10c6f6", "metadata": {}, "source": [ "## Package Import" ] }, { "cell_type": "code", "execution_count": 1, "id": "333fabdc", "metadata": { "execution": { "iopub.execute_input": "2025-09-02T18:59:13.658982Z", "iopub.status.busy": "2025-09-02T18:59:13.658845Z", "iopub.status.idle": "2025-09-02T18:59:13.887783Z", "shell.execute_reply": "2025-09-02T18:59:13.886971Z" }, "tags": [ "hide-output", "scroll-output" ] }, "outputs": [], "source": [ "import pandas as pd\n", "import numpy as np" ] }, { "cell_type": "markdown", "id": "da6983a8", "metadata": {}, "source": [ "## Dataset Import" ] }, { "cell_type": "markdown", "id": "977644a9", "metadata": {}, "source": [ "The dataset used in this notebook is from [Kaggle - Pokemon](https://www.kaggle.com/datasets/abcsds/pokemon)." ] }, { "cell_type": "code", "execution_count": 2, "id": "e3e47473", "metadata": { "execution": { "iopub.execute_input": "2025-09-02T18:59:13.889637Z", "iopub.status.busy": "2025-09-02T18:59:13.889477Z", "iopub.status.idle": "2025-09-02T18:59:13.893935Z", "shell.execute_reply": "2025-09-02T18:59:13.893456Z" }, "tags": [ "hide-output", "scroll-output" ] }, "outputs": [], "source": [ "data = pd.read_csv('data/Pokemon.csv')" ] }, { "cell_type": "code", "execution_count": 3, "id": "325b680a", "metadata": { "execution": { "iopub.execute_input": "2025-09-02T18:59:13.895312Z", "iopub.status.busy": "2025-09-02T18:59:13.895194Z", "iopub.status.idle": "2025-09-02T18:59:13.903642Z", "shell.execute_reply": "2025-09-02T18:59:13.903039Z" }, "tags": [ "hide-output", "scroll-output" ] }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
#NameType 1Type 2TotalHPAttackDefenseSp. AtkSp. DefSpeedGenerationLegendary
01BulbasaurGrassPoison3184549496565451False
12IvysaurGrassPoison4056062638080601False
23VenusaurGrassPoison525808283100100801False
33VenusaurMega VenusaurGrassPoison62580100123122120801False
44CharmanderFireNaN3093952436050651False
..........................................
795719DiancieRockFairy60050100150100150506True
796719DiancieMega DiancieRockFairy700501601101601101106True
797720HoopaHoopa ConfinedPsychicGhost6008011060150130706True
798720HoopaHoopa UnboundPsychicDark6808016060170130806True
799721VolcanionFireWater6008011012013090706True
\n", "

800 rows × 13 columns

\n", "
" ], "text/plain": [ " # Name Type 1 Type 2 Total HP Attack Defense \\\n", "0 1 Bulbasaur Grass Poison 318 45 49 49 \n", "1 2 Ivysaur Grass Poison 405 60 62 63 \n", "2 3 Venusaur Grass Poison 525 80 82 83 \n", "3 3 VenusaurMega Venusaur Grass Poison 625 80 100 123 \n", "4 4 Charmander Fire NaN 309 39 52 43 \n", ".. ... ... ... ... ... .. ... ... \n", "795 719 Diancie Rock Fairy 600 50 100 150 \n", "796 719 DiancieMega Diancie Rock Fairy 700 50 160 110 \n", "797 720 HoopaHoopa Confined Psychic Ghost 600 80 110 60 \n", "798 720 HoopaHoopa Unbound Psychic Dark 680 80 160 60 \n", "799 721 Volcanion Fire Water 600 80 110 120 \n", "\n", " Sp. Atk Sp. Def Speed Generation Legendary \n", "0 65 65 45 1 False \n", "1 80 80 60 1 False \n", "2 100 100 80 1 False \n", "3 122 120 80 1 False \n", "4 60 50 65 1 False \n", ".. ... ... ... ... ... \n", "795 100 150 50 6 True \n", "796 160 110 110 6 True \n", "797 150 130 70 6 True \n", "798 170 130 80 6 True \n", "799 130 90 70 6 True \n", "\n", "[800 rows x 13 columns]" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data" ] }, { "cell_type": "markdown", "id": "ce370a0d", "metadata": {}, "source": [ "## Deal with large datasets" ] }, { "cell_type": "markdown", "id": "438f261e", "metadata": {}, "source": [ "### Check memory usage" ] }, { "cell_type": "code", "execution_count": 4, "id": "97593be1", "metadata": { "execution": { "iopub.execute_input": "2025-09-02T18:59:13.905032Z", "iopub.status.busy": "2025-09-02T18:59:13.904913Z", "iopub.status.idle": "2025-09-02T18:59:13.910935Z", "shell.execute_reply": "2025-09-02T18:59:13.910471Z" }, "tags": [ "hide-output", "scroll-output" ] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "RangeIndex: 800 entries, 0 to 799\n", "Data columns (total 13 columns):\n", " # Column Non-Null Count Dtype \n", "--- ------ -------------- ----- \n", " 0 # 800 non-null int64 \n", " 1 Name 800 non-null object\n", " 2 Type 1 800 non-null object\n", " 3 Type 2 414 non-null object\n", " 4 Total 800 non-null int64 \n", " 5 HP 800 non-null int64 \n", " 6 Attack 800 non-null int64 \n", " 7 Defense 800 non-null int64 \n", " 8 Sp. Atk 800 non-null int64 \n", " 9 Sp. Def 800 non-null int64 \n", " 10 Speed 800 non-null int64 \n", " 11 Generation 800 non-null int64 \n", " 12 Legendary 800 non-null bool \n", "dtypes: bool(1), int64(9), object(3)\n", "memory usage: 179.0 KB\n" ] } ], "source": [ "data.info(memory_usage='deep')" ] }, { "cell_type": "markdown", "id": "1605a40f", "metadata": {}, "source": [ "### Load specific columns" ] }, { "cell_type": "markdown", "id": "49fc08f5", "metadata": {}, "source": [ "We can load only the columns we need by using the `usecols` parameter of `pd.read_csv()`." ] }, { "cell_type": "code", "execution_count": 5, "id": "2db35d1b", "metadata": { "execution": { "iopub.execute_input": "2025-09-02T18:59:13.912188Z", "iopub.status.busy": "2025-09-02T18:59:13.912076Z", "iopub.status.idle": "2025-09-02T18:59:13.916970Z", "shell.execute_reply": "2025-09-02T18:59:13.916610Z" }, "tags": [ "hide-output", "scroll-output" ] }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
NameType 1
0BulbasaurGrass
1IvysaurGrass
2VenusaurGrass
3VenusaurMega VenusaurGrass
4CharmanderFire
\n", "
" ], "text/plain": [ " Name Type 1\n", "0 Bulbasaur Grass\n", "1 Ivysaur Grass\n", "2 Venusaur Grass\n", "3 VenusaurMega Venusaur Grass\n", "4 Charmander Fire" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "small_data = pd.read_csv('data/Pokemon.csv', usecols=['Name', 'Type 1'])\n", "small_data.head()" ] }, { "cell_type": "markdown", "id": "71f1eb54", "metadata": {}, "source": [ "And it indeed saves memory:" ] }, { "cell_type": "code", "execution_count": 6, "id": "32424359", "metadata": { "execution": { "iopub.execute_input": "2025-09-02T18:59:13.918281Z", "iopub.status.busy": "2025-09-02T18:59:13.918169Z", "iopub.status.idle": "2025-09-02T18:59:13.921826Z", "shell.execute_reply": "2025-09-02T18:59:13.921448Z" }, "tags": [ "hide-output", "scroll-output" ] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "RangeIndex: 800 entries, 0 to 799\n", "Data columns (total 2 columns):\n", " # Column Non-Null Count Dtype \n", "--- ------ -------------- ----- \n", " 0 Name 800 non-null object\n", " 1 Type 1 800 non-null object\n", "dtypes: object(2)\n", "memory usage: 87.8 KB\n" ] } ], "source": [ "small_data.info(memory_usage='deep')" ] }, { "cell_type": "markdown", "id": "1d5bddb2", "metadata": {}, "source": [ "If we know a column has only a few unique values, we can load it as `category` type to save memory:" ] }, { "cell_type": "code", "execution_count": 7, "id": "e1465e0d", "metadata": { "execution": { "iopub.execute_input": "2025-09-02T18:59:13.922976Z", "iopub.status.busy": "2025-09-02T18:59:13.922867Z", "iopub.status.idle": "2025-09-02T18:59:13.928213Z", "shell.execute_reply": "2025-09-02T18:59:13.927812Z" }, "tags": [ "hide-output", "scroll-output" ] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "RangeIndex: 800 entries, 0 to 799\n", "Data columns (total 2 columns):\n", " # Column Non-Null Count Dtype \n", "--- ------ -------------- ----- \n", " 0 Name 800 non-null object \n", " 1 Type 1 800 non-null category\n", "dtypes: category(1), object(1)\n", "memory usage: 47.7 KB\n" ] } ], "source": [ "smaller_data = pd.read_csv('data/Pokemon.csv', usecols=['Name', 'Type 1'], dtype={'Type 1': 'category'})\n", "smaller_data.info(memory_usage='deep')" ] }, { "cell_type": "markdown", "id": "fa1c436b", "metadata": {}, "source": [ "### Load in chunks\n", "`next()` is used here to get the first chunk (a DataFrame with 100 rows) from the chunked CSV reader, so you can call `.head()` on it." ] }, { "cell_type": "code", "execution_count": 8, "id": "42d85e19", "metadata": { "execution": { "iopub.execute_input": "2025-09-02T18:59:13.929514Z", "iopub.status.busy": "2025-09-02T18:59:13.929399Z", "iopub.status.idle": "2025-09-02T18:59:13.935812Z", "shell.execute_reply": "2025-09-02T18:59:13.935422Z" }, "tags": [ "hide-output", "scroll-output" ] }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
#NameType 1Type 2TotalHPAttackDefenseSp. AtkSp. DefSpeedGenerationLegendary
01BulbasaurGrassPoison3184549496565451False
12IvysaurGrassPoison4056062638080601False
23VenusaurGrassPoison525808283100100801False
33VenusaurMega VenusaurGrassPoison62580100123122120801False
44CharmanderFireNaN3093952436050651False
..........................................
9588GrimerPoisonNaN3258080504050251False
9689MukPoisonNaN5001051057565100501False
9790ShellderWaterNaN30530651004525401False
9891CloysterWaterIce52550951808545701False
9992GastlyGhostPoison31030353010035801False
\n", "

100 rows × 13 columns

\n", "
" ], "text/plain": [ " # Name Type 1 Type 2 Total HP Attack Defense \\\n", "0 1 Bulbasaur Grass Poison 318 45 49 49 \n", "1 2 Ivysaur Grass Poison 405 60 62 63 \n", "2 3 Venusaur Grass Poison 525 80 82 83 \n", "3 3 VenusaurMega Venusaur Grass Poison 625 80 100 123 \n", "4 4 Charmander Fire NaN 309 39 52 43 \n", ".. .. ... ... ... ... ... ... ... \n", "95 88 Grimer Poison NaN 325 80 80 50 \n", "96 89 Muk Poison NaN 500 105 105 75 \n", "97 90 Shellder Water NaN 305 30 65 100 \n", "98 91 Cloyster Water Ice 525 50 95 180 \n", "99 92 Gastly Ghost Poison 310 30 35 30 \n", "\n", " Sp. Atk Sp. Def Speed Generation Legendary \n", "0 65 65 45 1 False \n", "1 80 80 60 1 False \n", "2 100 100 80 1 False \n", "3 122 120 80 1 False \n", "4 60 50 65 1 False \n", ".. ... ... ... ... ... \n", "95 40 50 25 1 False \n", "96 65 100 50 1 False \n", "97 45 25 40 1 False \n", "98 85 45 70 1 False \n", "99 100 35 80 1 False \n", "\n", "[100 rows x 13 columns]" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "chunks_df = pd.read_csv('data/Pokemon.csv', chunksize=100)\n", "next(chunks_df) # first 100 rows" ] }, { "cell_type": "markdown", "id": "96d337b9", "metadata": {}, "source": [ "We can save each chunk to separate data files for later usage:" ] }, { "cell_type": "code", "execution_count": 9, "id": "5746c696", "metadata": { "execution": { "iopub.execute_input": "2025-09-02T18:59:13.937168Z", "iopub.status.busy": "2025-09-02T18:59:13.937054Z", "iopub.status.idle": "2025-09-02T18:59:13.948339Z", "shell.execute_reply": "2025-09-02T18:59:13.947745Z" }, "tags": [ "hide-output", "scroll-output" ] }, "outputs": [], "source": [ "for i, df in enumerate(pd.read_csv('data/Pokemon.csv', chunksize=100)):\n", " df.to_csv(f'data/Pokemon_{i}.csv', index=False)" ] }, { "cell_type": "code", "execution_count": 10, "id": "d8e1a6bf", "metadata": { "execution": { "iopub.execute_input": "2025-09-02T18:59:13.949634Z", "iopub.status.busy": "2025-09-02T18:59:13.949516Z", "iopub.status.idle": "2025-09-02T18:59:13.955890Z", "shell.execute_reply": "2025-09-02T18:59:13.955496Z" }, "tags": [ "hide-output", "scroll-output" ] }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
#NameType 1Type 2TotalHPAttackDefenseSp. AtkSp. DefSpeedGenerationLegendary
01BulbasaurGrassPoison3184549496565451False
12IvysaurGrassPoison4056062638080601False
23VenusaurGrassPoison525808283100100801False
33VenusaurMega VenusaurGrassPoison62580100123122120801False
44CharmanderFireNaN3093952436050651False
\n", "
" ], "text/plain": [ " # Name Type 1 Type 2 Total HP Attack Defense \\\n", "0 1 Bulbasaur Grass Poison 318 45 49 49 \n", "1 2 Ivysaur Grass Poison 405 60 62 63 \n", "2 3 Venusaur Grass Poison 525 80 82 83 \n", "3 3 VenusaurMega Venusaur Grass Poison 625 80 100 123 \n", "4 4 Charmander Fire NaN 309 39 52 43 \n", "\n", " Sp. Atk Sp. Def Speed Generation Legendary \n", "0 65 65 45 1 False \n", "1 80 80 60 1 False \n", "2 100 100 80 1 False \n", "3 122 120 80 1 False \n", "4 60 50 65 1 False " ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df0 = pd.read_csv('data/Pokemon_0.csv')\n", "df0.head()" ] }, { "cell_type": "code", "execution_count": 11, "id": "de5e413c", "metadata": { "execution": { "iopub.execute_input": "2025-09-02T18:59:13.957258Z", "iopub.status.busy": "2025-09-02T18:59:13.957126Z", "iopub.status.idle": "2025-09-02T18:59:13.963357Z", "shell.execute_reply": "2025-09-02T18:59:13.962984Z" }, "tags": [ "hide-output", "scroll-output" ] }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
#NameType 1Type 2TotalHPAttackDefenseSp. AtkSp. DefSpeedGenerationLegendary
093HaunterGhostPoison40545504511555951False
194GengarGhostPoison500606560130751101False
294GengarMega GengarGhostPoison600606580170951301False
395OnixRockGround38535451603045701False
496DrowzeePsychicNaN3286048454390421False
\n", "
" ], "text/plain": [ " # Name Type 1 Type 2 Total HP Attack Defense \\\n", "0 93 Haunter Ghost Poison 405 45 50 45 \n", "1 94 Gengar Ghost Poison 500 60 65 60 \n", "2 94 GengarMega Gengar Ghost Poison 600 60 65 80 \n", "3 95 Onix Rock Ground 385 35 45 160 \n", "4 96 Drowzee Psychic NaN 328 60 48 45 \n", "\n", " Sp. Atk Sp. Def Speed Generation Legendary \n", "0 115 55 95 1 False \n", "1 130 75 110 1 False \n", "2 170 95 130 1 False \n", "3 30 45 70 1 False \n", "4 43 90 42 1 False " ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df1 = pd.read_csv('data/Pokemon_1.csv')\n", "df1.head()" ] }, { "cell_type": "markdown", "id": "6d0a13e7", "metadata": {}, "source": [ "But how do we combine the data back? See [this article](https://cornel05.github.io/cornel.ai/notebooks/snippets/pandas/pandas_build_dataframe_multiple_files_row_wise.html) for more details." ] } ], "metadata": { "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.12.3" } }, "nbformat": 4, "nbformat_minor": 5 }