{ "cells": [ { "cell_type": "markdown", "id": "e0bea234", "metadata": {}, "source": [ "# pandas: Convert Strings to Numbers" ] }, { "cell_type": "markdown", "id": "bf054109", "metadata": {}, "source": [ "## Package Import" ] }, { "cell_type": "code", "execution_count": 1, "id": "3a5c1752", "metadata": { "execution": { "iopub.execute_input": "2025-09-02T18:58:48.010488Z", "iopub.status.busy": "2025-09-02T18:58:48.010352Z", "iopub.status.idle": "2025-09-02T18:58:48.229636Z", "shell.execute_reply": "2025-09-02T18:58:48.229088Z" }, "tags": [ "hide-output", "scroll-output" ] }, "outputs": [], "source": [ "import pandas as pd\n", "import numpy as np" ] }, { "cell_type": "markdown", "id": "bc0637ab", "metadata": {}, "source": [ "## Dataset Import" ] }, { "cell_type": "markdown", "id": "e68cca67", "metadata": {}, "source": [ "The dataset used in this notebook is from [Kaggle - Pokemon](https://www.kaggle.com/datasets/abcsds/pokemon)." ] }, { "cell_type": "code", "execution_count": 2, "id": "a4dbf275", "metadata": { "execution": { "iopub.execute_input": "2025-09-02T18:58:48.231356Z", "iopub.status.busy": "2025-09-02T18:58:48.231203Z", "iopub.status.idle": "2025-09-02T18:58:48.236126Z", "shell.execute_reply": "2025-09-02T18:58:48.235528Z" }, "tags": [ "hide-output", "scroll-output" ] }, "outputs": [], "source": [ "data = pd.read_csv('data/Pokemon.csv')" ] }, { "cell_type": "code", "execution_count": 3, "id": "0325c887", "metadata": { "execution": { "iopub.execute_input": "2025-09-02T18:58:48.237421Z", "iopub.status.busy": "2025-09-02T18:58:48.237310Z", "iopub.status.idle": "2025-09-02T18:58:48.246240Z", "shell.execute_reply": "2025-09-02T18:58:48.245496Z" }, "tags": [ "hide-output", "scroll-output" ] }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
#NameType 1Type 2TotalHPAttackDefenseSp. AtkSp. DefSpeedGenerationLegendary
01BulbasaurGrassPoison3184549496565451False
12IvysaurGrassPoison4056062638080601False
23VenusaurGrassPoison525808283100100801False
33VenusaurMega VenusaurGrassPoison62580100123122120801False
44CharmanderFireNaN3093952436050651False
..........................................
795719DiancieRockFairy60050100150100150506True
796719DiancieMega DiancieRockFairy700501601101601101106True
797720HoopaHoopa ConfinedPsychicGhost6008011060150130706True
798720HoopaHoopa UnboundPsychicDark6808016060170130806True
799721VolcanionFireWater6008011012013090706True
\n", "

800 rows × 13 columns

\n", "
" ], "text/plain": [ " # Name Type 1 Type 2 Total HP Attack Defense \\\n", "0 1 Bulbasaur Grass Poison 318 45 49 49 \n", "1 2 Ivysaur Grass Poison 405 60 62 63 \n", "2 3 Venusaur Grass Poison 525 80 82 83 \n", "3 3 VenusaurMega Venusaur Grass Poison 625 80 100 123 \n", "4 4 Charmander Fire NaN 309 39 52 43 \n", ".. ... ... ... ... ... .. ... ... \n", "795 719 Diancie Rock Fairy 600 50 100 150 \n", "796 719 DiancieMega Diancie Rock Fairy 700 50 160 110 \n", "797 720 HoopaHoopa Confined Psychic Ghost 600 80 110 60 \n", "798 720 HoopaHoopa Unbound Psychic Dark 680 80 160 60 \n", "799 721 Volcanion Fire Water 600 80 110 120 \n", "\n", " Sp. Atk Sp. Def Speed Generation Legendary \n", "0 65 65 45 1 False \n", "1 80 80 60 1 False \n", "2 100 100 80 1 False \n", "3 122 120 80 1 False \n", "4 60 50 65 1 False \n", ".. ... ... ... ... ... \n", "795 100 150 50 6 True \n", "796 160 110 110 6 True \n", "797 150 130 70 6 True \n", "798 170 130 80 6 True \n", "799 130 90 70 6 True \n", "\n", "[800 rows x 13 columns]" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data" ] }, { "cell_type": "markdown", "id": "068e2a03", "metadata": {}, "source": [ "## Convert strings to numbers" ] }, { "cell_type": "code", "execution_count": 4, "id": "7bd3e938", "metadata": { "execution": { "iopub.execute_input": "2025-09-02T18:58:48.247513Z", "iopub.status.busy": "2025-09-02T18:58:48.247407Z", "iopub.status.idle": "2025-09-02T18:58:48.251181Z", "shell.execute_reply": "2025-09-02T18:58:48.250820Z" }, "tags": [ "hide-output", "scroll-output" ] }, "outputs": [ { "data": { "text/plain": [ "( col1 col2 col3\n", " 0 1.1 4.4 7.7\n", " 1 2.2 5.5 8.8\n", " 2 3.3 6.6 -,\n", " col1 object\n", " col2 object\n", " col3 object\n", " dtype: object)" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df = pd.DataFrame({'col1': ['1.1', '2.2', '3.3'], 'col2': ['4.4', '5.5', '6.6'], 'col3': ['7.7', '8.8', '-']})\n", "df, 
df.dtypes" ] }, { "cell_type": "markdown", "id": "77c3d5e7", "metadata": {}, "source": [ "`df.astype()` can convert multiple columns at once. With the default `errors='raise'`, a value that cannot be converted raises an exception; pass `errors='ignore'` to suppress the exception and leave columns that fail to convert unchanged." ] }, { "cell_type": "code", "execution_count": 5, "id": "6da10df7", "metadata": { "execution": { "iopub.execute_input": "2025-09-02T18:58:48.252382Z", "iopub.status.busy": "2025-09-02T18:58:48.252271Z", "iopub.status.idle": "2025-09-02T18:58:48.255940Z", "shell.execute_reply": "2025-09-02T18:58:48.255571Z" }, "tags": [ "hide-output", "scroll-output" ] }, "outputs": [ { "data": { "text/plain": [ "col1 float64\n", "col2 float64\n", "col3 object\n", "dtype: object" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.astype({'col1': 'float', 'col2': 'float'}, errors='raise').dtypes" ] }, { "cell_type": "code", "execution_count": 6, "id": "1a419108", "metadata": { "execution": { "iopub.execute_input": "2025-09-02T18:58:48.257273Z", "iopub.status.busy": "2025-09-02T18:58:48.257163Z", "iopub.status.idle": "2025-09-02T18:58:48.260663Z", "shell.execute_reply": "2025-09-02T18:58:48.260247Z" }, "tags": [ "hide-output", "scroll-output" ] }, "outputs": [ { "data": { "text/plain": [ "col1 float64\n", "col2 float64\n", "col3 object\n", "dtype: object" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.astype({'col1': 'float', 'col2': 'float', 'col3': 'float'}, errors='ignore').dtypes" ] }, { "cell_type": "markdown", "id": "df1153b8", "metadata": {}, "source": [ "A better way to convert strings to numbers is to use `pd.to_numeric()` with `errors='coerce'`, which replaces any value that cannot be parsed as a number with `NaN`."
] }, { "cell_type": "code", "execution_count": 7, "id": "210774df", "metadata": { "execution": { "iopub.execute_input": "2025-09-02T18:58:48.262054Z", "iopub.status.busy": "2025-09-02T18:58:48.261948Z", "iopub.status.idle": "2025-09-02T18:58:48.265181Z", "shell.execute_reply": "2025-09-02T18:58:48.264807Z" }, "tags": [ "hide-output", "scroll-output" ] }, "outputs": [ { "data": { "text/plain": [ "0 7.7\n", "1 8.8\n", "2 NaN\n", "Name: col3, dtype: float64" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pd.to_numeric(df.col3, errors='coerce')" ] }, { "cell_type": "code", "execution_count": 8, "id": "0c7e47ad", "metadata": { "execution": { "iopub.execute_input": "2025-09-02T18:58:48.266476Z", "iopub.status.busy": "2025-09-02T18:58:48.266374Z", "iopub.status.idle": "2025-09-02T18:58:48.269495Z", "shell.execute_reply": "2025-09-02T18:58:48.269119Z" }, "tags": [ "hide-output", "scroll-output" ] }, "outputs": [ { "data": { "text/plain": [ "0 7.7\n", "1 8.8\n", "2 0.0\n", "Name: col3, dtype: float64" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pd.to_numeric(df.col3, errors='coerce').fillna(0)" ] }, { "cell_type": "code", "execution_count": 9, "id": "b382cfe0", "metadata": { "execution": { "iopub.execute_input": "2025-09-02T18:58:48.270779Z", "iopub.status.busy": "2025-09-02T18:58:48.270669Z", "iopub.status.idle": "2025-09-02T18:58:48.274627Z", "shell.execute_reply": "2025-09-02T18:58:48.274237Z" }, "tags": [ "hide-output", "scroll-output" ] }, "outputs": [ { "data": { "text/plain": [ "( col1 col2 col3\n", " 0 1.1 4.4 7.7\n", " 1 2.2 5.5 8.8\n", " 2 3.3 6.6 0.0,\n", " col1 float64\n", " col2 float64\n", " col3 float64\n", " dtype: object)" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df = df.apply(pd.to_numeric, errors='coerce').fillna(0)\n", "df, df.dtypes" ] } ], "metadata": { "language_info": { "codemirror_mode": { "name": 
"ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.12.3" } }, "nbformat": 4, "nbformat_minor": 5 }