Pandas Basics

5 minute read

import pandas as pd
import numpy as np

# create a list of labels
labels = ['a', 'b', 'c']

# create a list of numbers
mylist = [10, 20, 30]

# convert list to an np array
arr = np.array(mylist)
arr

array([10, 20, 30])

# create a dictionary using the lists
d = {'a':10, 'b':20, 'c':30}
d

{'a': 10, 'b': 20, 'c': 30}

# create a pandas series
pd.Series(data=mylist)

a    10
b    20
c    30
dtype: int64

# note the datatype when array is non-numeric
pd.Series(data=['a','b','c'])

0    a
1    b
2    c
dtype: object
# create a new series
ser1 = pd.Series([1, 2, 3, 4], index=['USA', 'Germany', 'USSR', 'Japan']) 
ser1

USA        1
Germany    2
USSR       3
Japan      4
dtype: int64

# return location index
ser1['USA']

1

ser2 = pd.Series([1, 4, 5, 6], index=['USA', 'Germany', 'Italy', 'Japan'])
ser2

USA        1
Germany    4
Italy      5
Japan      6
dtype: int64

# adding the series
ser1 + ser2

Germany     6.0
Italy       NaN
Japan      10.0
USA         2.0
USSR        NaN
dtype: float64

# Dataframes
from numpy.random import randn
np.random.seed(101)

rand_matrix = randn(5,4)
rand_matrix

array([[ 2.70684984,  0.62813271,  0.90796945,  0.50382575],
       [ 0.65111795, -0.31931804, -0.84807698,  0.60596535],
       [-2.01816824,  0.74012206,  0.52881349, -0.58900053],
       [ 0.18869531, -0.75887206, -0.93323722,  0.95505651],
       [ 0.19079432,  1.97875732,  2.60596728,  0.68350889]])

# create the dataframe
df = pd.DataFrame(data=rand_matrix, index='A B C D E'.split(), columns=('W X Y Z').split())
df
W X Y Z
A 2.706850 0.628133 0.907969 0.503826
B 0.651118 -0.319318 -0.848077 0.605965
C -2.018168 0.740122 0.528813 -0.589001
D 0.188695 -0.758872 -0.933237 0.955057
E 0.190794 1.978757 2.605967 0.683509
# slicing dataframes -- column
df['W']

A    2.706850
B    0.651118
C   -2.018168
D    0.188695
E    0.190794
Name: W, dtype: float64

ind_list = ['W', 'X']
df[ind_list]
W X
A 2.706850 0.628133
B 0.651118 -0.319318
C -2.018168 0.740122
D 0.188695 -0.758872
E 0.190794 1.978757
# or 
df[['W', 'X']]
W X
A 2.706850 0.628133
B 0.651118 -0.319318
C -2.018168 0.740122
D 0.188695 -0.758872
E 0.190794 1.978757
# creating a new column on the df by summing two columns
df['NEW'] = df['W'] + df['X']
df
W X Y Z NEW
A 2.706850 0.628133 0.907969 0.503826 3.334983
B 0.651118 -0.319318 -0.848077 0.605965 0.331800
C -2.018168 0.740122 0.528813 -0.589001 -1.278046
D 0.188695 -0.758872 -0.933237 0.955057 -0.570177
E 0.190794 1.978757 2.605967 0.683509 2.169552
# to delete a column- use .drop(axis=1) axis=0 is for rows
df.drop('NEW', axis=1)
W X Y Z
A 2.706850 0.628133 0.907969 0.503826
B 0.651118 -0.319318 -0.848077 0.605965
C -2.018168 0.740122 0.528813 -0.589001
D 0.188695 -0.758872 -0.933237 0.955057
E 0.190794 1.978757 2.605967 0.683509
df
W X Y Z NEW
A 2.706850 0.628133 0.907969 0.503826 3.334983
B 0.651118 -0.319318 -0.848077 0.605965 0.331800
C -2.018168 0.740122 0.528813 -0.589001 -1.278046
D 0.188695 -0.758872 -0.933237 0.955057 -0.570177
E 0.190794 1.978757 2.605967 0.683509 2.169552
# the drop was not in-place as we can see 'NEW' is still there
# to delete permanently
df.drop('NEW', axis=1, inplace=True)
df
W X Y Z
A 2.706850 0.628133 0.907969 0.503826
B 0.651118 -0.319318 -0.848077 0.605965
C -2.018168 0.740122 0.528813 -0.589001
D 0.188695 -0.758872 -0.933237 0.955057
E 0.190794 1.978757 2.605967 0.683509
# to drop rows simple use drop
df.drop('A')
df
W X Y Z
B 0.651118 -0.319318 -0.848077 0.605965
C -2.018168 0.740122 0.528813 -0.589001
D 0.188695 -0.758872 -0.933237 0.955057
E 0.190794 1.978757 2.605967 0.683509

# select rows
# we can use .loc
df.loc['A']

W    2.706850
X    0.628133
Y    0.907969
Z    0.503826
Name: A, dtype: float64

# or we can use index/integer location iloc
df.iloc[0]

W    2.706850
X    0.628133
Y    0.907969
Z    0.503826
Name: A, dtype: float64

# selecting multiple rows
df.loc[['A','B']]
W X Y Z
A 2.706850 0.628133 0.907969 0.503826
B 0.651118 -0.319318 -0.848077 0.605965

df.iloc[0:2]

W X Y Z
A 2.706850 0.628133 0.907969 0.503826
B 0.651118 -0.319318 -0.848077 0.605965
# to grab particular data points, lets say row AV and columns YZ
df.loc[['A','B']][['Y','Z']]
Y Z
A 0.907969 0.503826
B -0.848077 0.605965
# or
df.loc[['A','B'],['Y','Z']]
Y Z
A 0.907969 0.503826
B -0.848077 0.605965
# conditional selection
df > 0
W X Y Z
A True True True True
B True False False True
C False True True False
D True False False True
E True True True True
# pass the df_bool to original df to get the results of the condition
df[df_bool]
W X Y Z
A 2.706850 0.628133 0.907969 0.503826
B 0.651118 NaN NaN 0.605965
C NaN 0.740122 0.528813 NaN
D 0.188695 NaN NaN 0.955057
E 0.190794 1.978757 2.605967 0.683509
# or 
df[df > 0]
W X Y Z
A 2.706850 0.628133 0.907969 0.503826
B 0.651118 NaN NaN 0.605965
C NaN 0.740122 0.528813 NaN
D 0.188695 NaN NaN 0.955057
E 0.190794 1.978757 2.605967 0.683509
# to return the rows in a column with values greater than 0.5
df[df['W']>0.5]
W X Y Z
A 2.706850 0.628133 0.907969 0.503826
B 0.651118 -0.319318 -0.848077 0.605965
# lets get it from column 'Y'
df[df['W']> 0.5]['Y']

A    0.907969
B   -0.848077
Name: Y, dtype: float64
#multiple conditions
df[(df['W'] > 0) & (df['Y'] > 1)]
W X Y Z
E 0.190794 1.978757 2.605967 0.683509
# resetting the index-- turning the index into a new column name index
df.reset_index()
index W X Y Z
0 A 2.706850 0.628133 0.907969 0.503826
1 B 0.651118 -0.319318 -0.848077 0.605965
2 C -2.018168 0.740122 0.528813 -0.589001
3 D 0.188695 -0.758872 -0.933237 0.955057
4 E 0.190794 1.978757 2.605967 0.683509
# changes are not permanent as inplace is defaulted to false
df
W X Y Z
A 2.706850 0.628133 0.907969 0.503826
B 0.651118 -0.319318 -0.848077 0.605965
C -2.018168 0.740122 0.528813 -0.589001
D 0.188695 -0.758872 -0.933237 0.955057
E 0.190794 1.978757 2.605967 0.683509

# resetting the index
new_indx = 'CA NY WY OR CO'.split()

new_indx
['CA', 'NY', 'WY', 'OR', 'CO']


#create new column
df['States'] = new_indx
df
W X Y Z States
A 2.706850 0.628133 0.907969 0.503826 CA
B 0.651118 -0.319318 -0.848077 0.605965 NY
C -2.018168 0.740122 0.528813 -0.589001 WY
D 0.188695 -0.758872 -0.933237 0.955057 OR
E 0.190794 1.978757 2.605967 0.683509 CO
# set states as index -- inplace is always false
df.set_index('States')
W X Y Z
States
CA 2.706850 0.628133 0.907969 0.503826
NY 0.651118 -0.319318 -0.848077 0.605965
WY -2.018168 0.740122 0.528813 -0.589001
OR 0.188695 -0.758872 -0.933237 0.955057
CO 0.190794 1.978757 2.605967 0.683509
# get df info
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5 entries, A to E
Data columns (total 5 columns):
W         5 non-null float64
X         5 non-null float64
Y         5 non-null float64
Z         5 non-null float64
States    5 non-null object
dtypes: float64(4), object(1)
memory usage: 400.0+ bytes

#get df types
df.dtypes

W         float64
X         float64
Y         float64
Z         float64
States     object
dtype: object

 statistics 
df.describe()
W X Y Z
count 5.000000 5.000000 5.000000 5.000000
mean 0.343858 0.453764 0.452287 0.431871
std 1.681131 1.061385 1.454516 0.594708
min -2.018168 -0.758872 -0.933237 -0.589001
25% 0.188695 -0.319318 -0.848077 0.503826
50% 0.190794 0.628133 0.528813 0.605965
75% 0.651118 0.740122 0.907969 0.683509
max 2.706850 1.978757 2.605967 0.955057

Tags:

Updated: