Pandas Basics
import pandas as pd
import numpy as np
# create a list of labels
labels = ['a', 'b', 'c']
# create a list of numbers
mylist = [10, 20, 30]
# convert list to an np array
arr = np.array(mylist)
arr
array([10, 20, 30])
# create a dictionary using the lists
d = {'a':10, 'b':20, 'c':30}
d
{'a': 10, 'b': 20, 'c': 30}
# create a pandas series
pd.Series(data=mylist)
a 10
b 20
c 30
dtype: int64
# note the datatype when array is non-numeric
pd.Series(data=['a','b','c'])
0 a
1 b
2 c
dtype: object
# create a new series
ser1 = pd.Series([1, 2, 3, 4], index=['USA', 'Germany', 'USSR', 'Japan'])
ser1
USA 1
Germany 2
USSR 3
Japan 4
dtype: int64
# return location index
ser1['USA']
1
ser2 = pd.Series([1, 4, 5, 6], index=['USA', 'Germany', 'Italy', 'Japan'])
ser2
USA 1
Germany 4
Italy 5
Japan 6
dtype: int64
# adding the series
ser1 + ser2
Germany 6.0
Italy NaN
Japan 10.0
USA 2.0
USSR NaN
dtype: float64
# Dataframes
from numpy.random import randn
np.random.seed(101)
rand_matrix = randn(5,4)
rand_matrix
array([[ 2.70684984, 0.62813271, 0.90796945, 0.50382575],
[ 0.65111795, -0.31931804, -0.84807698, 0.60596535],
[-2.01816824, 0.74012206, 0.52881349, -0.58900053],
[ 0.18869531, -0.75887206, -0.93323722, 0.95505651],
[ 0.19079432, 1.97875732, 2.60596728, 0.68350889]])
# create the dataframe
df = pd.DataFrame(data=rand_matrix, index='A B C D E'.split(), columns=('W X Y Z').split())
df
W | X | Y | Z | |
---|---|---|---|---|
A | 2.706850 | 0.628133 | 0.907969 | 0.503826 |
B | 0.651118 | -0.319318 | -0.848077 | 0.605965 |
C | -2.018168 | 0.740122 | 0.528813 | -0.589001 |
D | 0.188695 | -0.758872 | -0.933237 | 0.955057 |
E | 0.190794 | 1.978757 | 2.605967 | 0.683509 |
# slicing dataframes -- column
df['W']
A 2.706850
B 0.651118
C -2.018168
D 0.188695
E 0.190794
Name: W, dtype: float64
ind_list = ['W', 'X']
df[ind_list]
W | X | |
---|---|---|
A | 2.706850 | 0.628133 |
B | 0.651118 | -0.319318 |
C | -2.018168 | 0.740122 |
D | 0.188695 | -0.758872 |
E | 0.190794 | 1.978757 |
# or
df[['W', 'X']]
W | X | |
---|---|---|
A | 2.706850 | 0.628133 |
B | 0.651118 | -0.319318 |
C | -2.018168 | 0.740122 |
D | 0.188695 | -0.758872 |
E | 0.190794 | 1.978757 |
# creating a new column on the df by summing two columns
df['NEW'] = df['W'] + df['X']
df
W | X | Y | Z | NEW | |
---|---|---|---|---|---|
A | 2.706850 | 0.628133 | 0.907969 | 0.503826 | 3.334983 |
B | 0.651118 | -0.319318 | -0.848077 | 0.605965 | 0.331800 |
C | -2.018168 | 0.740122 | 0.528813 | -0.589001 | -1.278046 |
D | 0.188695 | -0.758872 | -0.933237 | 0.955057 | -0.570177 |
E | 0.190794 | 1.978757 | 2.605967 | 0.683509 | 2.169552 |
# to delete a column- use .drop(axis=1) axis=0 is for rows
df.drop('NEW', axis=1)
W | X | Y | Z | |
---|---|---|---|---|
A | 2.706850 | 0.628133 | 0.907969 | 0.503826 |
B | 0.651118 | -0.319318 | -0.848077 | 0.605965 |
C | -2.018168 | 0.740122 | 0.528813 | -0.589001 |
D | 0.188695 | -0.758872 | -0.933237 | 0.955057 |
E | 0.190794 | 1.978757 | 2.605967 | 0.683509 |
df
W | X | Y | Z | NEW | |
---|---|---|---|---|---|
A | 2.706850 | 0.628133 | 0.907969 | 0.503826 | 3.334983 |
B | 0.651118 | -0.319318 | -0.848077 | 0.605965 | 0.331800 |
C | -2.018168 | 0.740122 | 0.528813 | -0.589001 | -1.278046 |
D | 0.188695 | -0.758872 | -0.933237 | 0.955057 | -0.570177 |
E | 0.190794 | 1.978757 | 2.605967 | 0.683509 | 2.169552 |
# the drop was not in-place as we can see 'NEW' is still there
# to delete permanently
df.drop('NEW', axis=1, inplace=True)
df
W | X | Y | Z | |
---|---|---|---|---|
A | 2.706850 | 0.628133 | 0.907969 | 0.503826 |
B | 0.651118 | -0.319318 | -0.848077 | 0.605965 |
C | -2.018168 | 0.740122 | 0.528813 | -0.589001 |
D | 0.188695 | -0.758872 | -0.933237 | 0.955057 |
E | 0.190794 | 1.978757 | 2.605967 | 0.683509 |
# to drop rows simple use drop
df.drop('A')
df
W | X | Y | Z | |
---|---|---|---|---|
B | 0.651118 | -0.319318 | -0.848077 | 0.605965 |
C | -2.018168 | 0.740122 | 0.528813 | -0.589001 |
D | 0.188695 | -0.758872 | -0.933237 | 0.955057 |
E | 0.190794 | 1.978757 | 2.605967 | 0.683509 |
# select rows
# we can use .loc
df.loc['A']
W 2.706850
X 0.628133
Y 0.907969
Z 0.503826
Name: A, dtype: float64
# or we can use index/integer location iloc
df.iloc[0]
W 2.706850
X 0.628133
Y 0.907969
Z 0.503826
Name: A, dtype: float64
# selecting multiple rows
df.loc[['A','B']]
W | X | Y | Z | |
---|---|---|---|---|
A | 2.706850 | 0.628133 | 0.907969 | 0.503826 |
B | 0.651118 | -0.319318 | -0.848077 | 0.605965 |
df.iloc[0:2]
W | X | Y | Z | |
---|---|---|---|---|
A | 2.706850 | 0.628133 | 0.907969 | 0.503826 |
B | 0.651118 | -0.319318 | -0.848077 | 0.605965 |
# to grab particular data points, lets say row AV and columns YZ
df.loc[['A','B']][['Y','Z']]
Y | Z | |
---|---|---|
A | 0.907969 | 0.503826 |
B | -0.848077 | 0.605965 |
# or
df.loc[['A','B'],['Y','Z']]
Y | Z | |
---|---|---|
A | 0.907969 | 0.503826 |
B | -0.848077 | 0.605965 |
# conditional selection
df > 0
W | X | Y | Z | |
---|---|---|---|---|
A | True | True | True | True |
B | True | False | False | True |
C | False | True | True | False |
D | True | False | False | True |
E | True | True | True | True |
# pass the df_bool to original df to get the results of the condition
df[df_bool]
W | X | Y | Z | |
---|---|---|---|---|
A | 2.706850 | 0.628133 | 0.907969 | 0.503826 |
B | 0.651118 | NaN | NaN | 0.605965 |
C | NaN | 0.740122 | 0.528813 | NaN |
D | 0.188695 | NaN | NaN | 0.955057 |
E | 0.190794 | 1.978757 | 2.605967 | 0.683509 |
# or
df[df > 0]
W | X | Y | Z | |
---|---|---|---|---|
A | 2.706850 | 0.628133 | 0.907969 | 0.503826 |
B | 0.651118 | NaN | NaN | 0.605965 |
C | NaN | 0.740122 | 0.528813 | NaN |
D | 0.188695 | NaN | NaN | 0.955057 |
E | 0.190794 | 1.978757 | 2.605967 | 0.683509 |
# to return the rows in a column with values greater than 0.5
df[df['W']>0.5]
W | X | Y | Z | |
---|---|---|---|---|
A | 2.706850 | 0.628133 | 0.907969 | 0.503826 |
B | 0.651118 | -0.319318 | -0.848077 | 0.605965 |
# lets get it from column 'Y'
df[df['W']> 0.5]['Y']
A 0.907969
B -0.848077
Name: Y, dtype: float64
#multiple conditions
df[(df['W'] > 0) & (df['Y'] > 1)]
W | X | Y | Z | |
---|---|---|---|---|
E | 0.190794 | 1.978757 | 2.605967 | 0.683509 |
# resetting the index-- turning the index into a new column name index
df.reset_index()
index | W | X | Y | Z | |
---|---|---|---|---|---|
0 | A | 2.706850 | 0.628133 | 0.907969 | 0.503826 |
1 | B | 0.651118 | -0.319318 | -0.848077 | 0.605965 |
2 | C | -2.018168 | 0.740122 | 0.528813 | -0.589001 |
3 | D | 0.188695 | -0.758872 | -0.933237 | 0.955057 |
4 | E | 0.190794 | 1.978757 | 2.605967 | 0.683509 |
# changes are not permanent as inplace is defaulted to false
df
W | X | Y | Z | |
---|---|---|---|---|
A | 2.706850 | 0.628133 | 0.907969 | 0.503826 |
B | 0.651118 | -0.319318 | -0.848077 | 0.605965 |
C | -2.018168 | 0.740122 | 0.528813 | -0.589001 |
D | 0.188695 | -0.758872 | -0.933237 | 0.955057 |
E | 0.190794 | 1.978757 | 2.605967 | 0.683509 |
# resetting the index
new_indx = 'CA NY WY OR CO'.split()
new_indx
['CA', 'NY', 'WY', 'OR', 'CO']
#create new column
df['States'] = new_indx
df
W | X | Y | Z | States | |
---|---|---|---|---|---|
A | 2.706850 | 0.628133 | 0.907969 | 0.503826 | CA |
B | 0.651118 | -0.319318 | -0.848077 | 0.605965 | NY |
C | -2.018168 | 0.740122 | 0.528813 | -0.589001 | WY |
D | 0.188695 | -0.758872 | -0.933237 | 0.955057 | OR |
E | 0.190794 | 1.978757 | 2.605967 | 0.683509 | CO |
# set states as index -- inplace is always false
df.set_index('States')
W | X | Y | Z | |
---|---|---|---|---|
States | ||||
CA | 2.706850 | 0.628133 | 0.907969 | 0.503826 |
NY | 0.651118 | -0.319318 | -0.848077 | 0.605965 |
WY | -2.018168 | 0.740122 | 0.528813 | -0.589001 |
OR | 0.188695 | -0.758872 | -0.933237 | 0.955057 |
CO | 0.190794 | 1.978757 | 2.605967 | 0.683509 |
# get df info
df.info()
<class 'pandas.core.frame.DataFrame'>
Index: 5 entries, A to E
Data columns (total 5 columns):
W 5 non-null float64
X 5 non-null float64
Y 5 non-null float64
Z 5 non-null float64
States 5 non-null object
dtypes: float64(4), object(1)
memory usage: 400.0+ bytes
#get df types
df.dtypes
W float64
X float64
Y float64
Z float64
States object
dtype: object
statistics
df.describe()
W | X | Y | Z | |
---|---|---|---|---|
count | 5.000000 | 5.000000 | 5.000000 | 5.000000 |
mean | 0.343858 | 0.453764 | 0.452287 | 0.431871 |
std | 1.681131 | 1.061385 | 1.454516 | 0.594708 |
min | -2.018168 | -0.758872 | -0.933237 | -0.589001 |
25% | 0.188695 | -0.319318 | -0.848077 | 0.503826 |
50% | 0.190794 | 0.628133 | 0.528813 | 0.605965 |
75% | 0.651118 | 0.740122 | 0.907969 | 0.683509 |
max | 2.706850 | 1.978757 | 2.605967 | 0.955057 |