pandas: Split DataFrame into 2 Random Subsets#
Package Import#
import pandas as pd
import numpy as np
Dataset Import#
The dataset used in this notebook is the Pokemon dataset from Kaggle.
# Load the Pokemon CSV from the local data/ directory into a DataFrame.
data = pd.read_csv('data/Pokemon.csv')
# Bare expression as the last line of the cell: the notebook renders the DataFrame.
data
Show code cell output
Hide code cell output
# | Name | Type 1 | Type 2 | Total | HP | Attack | Defense | Sp. Atk | Sp. Def | Speed | Generation | Legendary | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | Bulbasaur | Grass | Poison | 318 | 45 | 49 | 49 | 65 | 65 | 45 | 1 | False |
1 | 2 | Ivysaur | Grass | Poison | 405 | 60 | 62 | 63 | 80 | 80 | 60 | 1 | False |
2 | 3 | Venusaur | Grass | Poison | 525 | 80 | 82 | 83 | 100 | 100 | 80 | 1 | False |
3 | 3 | VenusaurMega Venusaur | Grass | Poison | 625 | 80 | 100 | 123 | 122 | 120 | 80 | 1 | False |
4 | 4 | Charmander | Fire | NaN | 309 | 39 | 52 | 43 | 60 | 50 | 65 | 1 | False |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
795 | 719 | Diancie | Rock | Fairy | 600 | 50 | 100 | 150 | 100 | 150 | 50 | 6 | True |
796 | 719 | DiancieMega Diancie | Rock | Fairy | 700 | 50 | 160 | 110 | 160 | 110 | 110 | 6 | True |
797 | 720 | HoopaHoopa Confined | Psychic | Ghost | 600 | 80 | 110 | 60 | 150 | 130 | 70 | 6 | True |
798 | 720 | HoopaHoopa Unbound | Psychic | Dark | 680 | 80 | 160 | 60 | 170 | 130 | 80 | 6 | True |
799 | 721 | Volcanion | Fire | Water | 600 | 80 | 110 | 120 | 130 | 90 | 70 | 6 | True |
800 rows × 13 columns
Split a DataFrame into 2 random subsets#
We sample 75% of the rows of our DataFrame into `data_1`:
# Read the same CSV file again into `data`.
data = pd.read_csv('data/Pokemon.csv')
# Number of rows in the full dataset.
len(data)
Show code cell output
Hide code cell output
800
# Draw a random 75% of the rows; the fixed random_state makes the draw reproducible.
data_1 = data.sample(frac=0.75, random_state=1234)
# sample() returns the rows in shuffled order, so sort the index labels for display.
np.sort(data_1.index)
Show code cell output
Hide code cell output
array([ 0, 2, 5, 6, 7, 8, 9, 11, 13, 16, 17, 19, 20,
21, 22, 23, 24, 25, 27, 28, 29, 31, 33, 34, 35, 36,
37, 38, 39, 40, 42, 43, 44, 47, 49, 50, 51, 52, 54,
55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67,
69, 70, 71, 72, 73, 74, 77, 78, 79, 80, 83, 85, 88,
89, 90, 91, 92, 93, 94, 95, 96, 97, 99, 100, 101, 102,
104, 105, 106, 108, 109, 110, 111, 112, 113, 114, 115, 118, 121,
122, 123, 124, 125, 126, 127, 128, 129, 131, 132, 133, 134, 137,
138, 140, 141, 143, 144, 145, 146, 147, 148, 149, 151, 152, 153,
155, 156, 157, 159, 160, 161, 162, 163, 166, 167, 168, 169, 170,
171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 184,
185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 198, 199, 200,
201, 202, 203, 205, 206, 207, 213, 214, 215, 216, 217, 218, 219,
220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231, 232,
234, 237, 238, 239, 240, 241, 242, 245, 247, 248, 249, 251, 252,
253, 254, 255, 260, 261, 262, 263, 264, 265, 266, 267, 268, 269,
270, 271, 272, 273, 274, 276, 277, 278, 280, 281, 282, 285, 287,
288, 289, 290, 296, 297, 298, 299, 300, 301, 302, 304, 305, 306,
307, 308, 309, 310, 315, 319, 320, 321, 322, 323, 324, 326, 327,
328, 329, 330, 331, 333, 335, 337, 338, 339, 341, 342, 344, 347,
348, 349, 351, 352, 353, 355, 356, 357, 358, 359, 360, 361, 362,
363, 364, 366, 367, 368, 369, 370, 373, 375, 376, 378, 380, 381,
382, 384, 385, 386, 387, 388, 389, 390, 391, 392, 393, 394, 395,
397, 398, 399, 401, 402, 403, 405, 406, 407, 409, 410, 411, 412,
413, 414, 415, 416, 418, 419, 420, 421, 422, 423, 424, 426, 427,
428, 430, 432, 433, 435, 437, 438, 441, 443, 444, 445, 446, 449,
450, 451, 452, 453, 454, 456, 457, 458, 461, 462, 463, 464, 466,
467, 468, 470, 472, 473, 474, 475, 477, 478, 479, 480, 481, 483,
484, 485, 486, 488, 489, 491, 492, 494, 495, 496, 498, 499, 500,
501, 502, 503, 504, 505, 507, 508, 511, 512, 513, 515, 516, 517,
518, 519, 521, 522, 523, 524, 525, 526, 529, 530, 532, 533, 534,
535, 536, 537, 538, 539, 541, 542, 543, 544, 545, 548, 549, 552,
554, 555, 556, 557, 558, 560, 563, 564, 565, 566, 567, 569, 570,
571, 572, 575, 576, 578, 579, 581, 582, 583, 584, 585, 586, 587,
588, 589, 590, 592, 594, 598, 599, 602, 603, 604, 605, 606, 607,
609, 611, 612, 613, 614, 615, 616, 618, 620, 621, 622, 623, 624,
627, 628, 629, 630, 631, 632, 634, 635, 636, 637, 639, 641, 642,
643, 644, 645, 646, 647, 648, 649, 650, 651, 652, 653, 656, 657,
658, 659, 660, 662, 665, 666, 667, 668, 669, 670, 672, 673, 674,
675, 676, 677, 678, 679, 680, 681, 682, 684, 688, 690, 691, 692,
693, 694, 696, 697, 698, 699, 700, 701, 703, 704, 705, 708, 709,
710, 711, 712, 713, 714, 715, 716, 718, 719, 720, 721, 722, 724,
725, 726, 727, 728, 729, 732, 733, 734, 735, 737, 738, 741, 742,
744, 746, 747, 749, 750, 753, 754, 755, 758, 759, 760, 761, 762,
764, 766, 767, 769, 770, 771, 772, 773, 774, 776, 777, 778, 780,
781, 782, 783, 785, 786, 787, 788, 789, 791, 793, 794, 796, 797,
798, 799])
Get `data_2` by simply dropping the rows whose labels are in `data_1.index`:
# Remove every row whose index label was sampled into data_1; what remains is data_2.
data_2 = data.drop(data_1.index)
# Show the size of the remainder together with its sorted index labels.
len(data_2), np.sort(data_2.index)
Show code cell output
Hide code cell output
(200,
array([ 1, 3, 4, 10, 12, 14, 15, 18, 26, 30, 32, 41, 45,
46, 48, 53, 68, 75, 76, 81, 82, 84, 86, 87, 98, 103,
107, 116, 117, 119, 120, 130, 135, 136, 139, 142, 150, 154, 158,
164, 165, 183, 195, 196, 197, 204, 208, 209, 210, 211, 212, 233,
235, 236, 243, 244, 246, 250, 256, 257, 258, 259, 275, 279, 283,
284, 286, 291, 292, 293, 294, 295, 303, 311, 312, 313, 314, 316,
317, 318, 325, 332, 334, 336, 340, 343, 345, 346, 350, 354, 365,
371, 372, 374, 377, 379, 383, 396, 400, 404, 408, 417, 425, 429,
431, 434, 436, 439, 440, 442, 447, 448, 455, 459, 460, 465, 469,
471, 476, 482, 487, 490, 493, 497, 506, 509, 510, 514, 520, 527,
528, 531, 540, 546, 547, 550, 551, 553, 559, 561, 562, 568, 573,
574, 577, 580, 591, 593, 595, 596, 597, 600, 601, 608, 610, 617,
619, 625, 626, 633, 638, 640, 654, 655, 661, 663, 664, 671, 683,
685, 686, 687, 689, 695, 702, 706, 707, 717, 723, 730, 731, 736,
739, 740, 743, 745, 748, 751, 752, 756, 757, 763, 765, 768, 775,
779, 784, 790, 792, 795]))
Do a quick check that the two subsets together contain as many rows as the original DataFrame:
# The two subset sizes should add up to the row count of the full dataset (800).
len(data_1) + len(data_2)
Show code cell output
Hide code cell output
800