Pandas Basics

5 minute read

import pandas as pd
import numpy as np

# create a list of labels
labels = ['a', 'b', 'c']

# create a list of numbers
mylist = [10, 20, 30]

# convert list to an np array
arr = np.array(mylist)
arr

array([10, 20, 30])

# create a dictionary using the lists
d = {'a':10, 'b':20, 'c':30}
d

{'a': 10, 'b': 20, 'c': 30}

# create a pandas series
pd.Series(data=mylist)

a    10
b    20
c    30
dtype: int64

# note the datatype when array is non-numeric
pd.Series(data=['a','b','c'])

0    a
1    b
2    c
dtype: object

# create a new series
ser1 = pd.Series([1, 2, 3, 4], index=['USA', 'Germany', 'USSR', 'Japan']) 
ser1

USA        1
Germany    2
USSR       3
Japan      4
dtype: int64

# return location index
ser1['USA']

1

ser2 = pd.Series([1, 4, 5, 6], index=['USA', 'Germany', 'Italy', 'Japan'])
ser2

USA        1
Germany    4
Italy      5
Japan      6
dtype: int64

# adding the series
ser1 + ser2

Germany     6.0
Italy       NaN
Japan      10.0
USA         2.0
USSR        NaN
dtype: float64

# Dataframes
from numpy.random import randn
np.random.seed(101)

rand_matrix = randn(5,4)
rand_matrix

array([[ 2.70684984,  0.62813271,  0.90796945,  0.50382575],
       [ 0.65111795, -0.31931804, -0.84807698,  0.60596535],
       [-2.01816824,  0.74012206,  0.52881349, -0.58900053],
       [ 0.18869531, -0.75887206, -0.93323722,  0.95505651],
       [ 0.19079432,  1.97875732,  2.60596728,  0.68350889]])

# create the dataframe
df = pd.DataFrame(data=rand_matrix, index='A B C D E'.split(), columns=('W X Y Z').split())
df

	W	X	Y	Z
A	2.706850	0.628133	0.907969	0.503826
B	0.651118	-0.319318	-0.848077	0.605965
C	-2.018168	0.740122	0.528813	-0.589001
D	0.188695	-0.758872	-0.933237	0.955057
E	0.190794	1.978757	2.605967	0.683509

# slicing dataframes -- column
df['W']

A    2.706850
B    0.651118
C   -2.018168
D    0.188695
E    0.190794
Name: W, dtype: float64

ind_list = ['W', 'X']
df[ind_list]

	W	X
A	2.706850	0.628133
B	0.651118	-0.319318
C	-2.018168	0.740122
D	0.188695	-0.758872
E	0.190794	1.978757

# or 
df[['W', 'X']]

	W	X
A	2.706850	0.628133
B	0.651118	-0.319318
C	-2.018168	0.740122
D	0.188695	-0.758872
E	0.190794	1.978757

# creating a new column on the df by summing two columns
df['NEW'] = df['W'] + df['X']
df

	W	X	Y	Z	NEW
A	2.706850	0.628133	0.907969	0.503826	3.334983
B	0.651118	-0.319318	-0.848077	0.605965	0.331800
C	-2.018168	0.740122	0.528813	-0.589001	-1.278046
D	0.188695	-0.758872	-0.933237	0.955057	-0.570177
E	0.190794	1.978757	2.605967	0.683509	2.169552

# to delete a column- use .drop(axis=1) axis=0 is for rows
df.drop('NEW', axis=1)

	W	X	Y	Z
A	2.706850	0.628133	0.907969	0.503826
B	0.651118	-0.319318	-0.848077	0.605965
C	-2.018168	0.740122	0.528813	-0.589001
D	0.188695	-0.758872	-0.933237	0.955057
E	0.190794	1.978757	2.605967	0.683509

df

	W	X	Y	Z	NEW
A	2.706850	0.628133	0.907969	0.503826	3.334983
B	0.651118	-0.319318	-0.848077	0.605965	0.331800
C	-2.018168	0.740122	0.528813	-0.589001	-1.278046
D	0.188695	-0.758872	-0.933237	0.955057	-0.570177
E	0.190794	1.978757	2.605967	0.683509	2.169552

# the drop was not in-place as we can see 'NEW' is still there
# to delete permanently
df.drop('NEW', axis=1, inplace=True)
df

	W	X	Y	Z
A	2.706850	0.628133	0.907969	0.503826
B	0.651118	-0.319318	-0.848077	0.605965
C	-2.018168	0.740122	0.528813	-0.589001
D	0.188695	-0.758872	-0.933237	0.955057
E	0.190794	1.978757	2.605967	0.683509

# to drop rows simple use drop
df.drop('A')
df

	W	X	Y	Z
B	0.651118	-0.319318	-0.848077	0.605965
C	-2.018168	0.740122	0.528813	-0.589001
D	0.188695	-0.758872	-0.933237	0.955057
E	0.190794	1.978757	2.605967	0.683509

# select rows
# we can use .loc
df.loc['A']

W    2.706850
X    0.628133
Y    0.907969
Z    0.503826
Name: A, dtype: float64

# or we can use index/integer location iloc
df.iloc[0]

W    2.706850
X    0.628133
Y    0.907969
Z    0.503826
Name: A, dtype: float64

# selecting multiple rows
df.loc[['A','B']]

	W	X	Y	Z
A	2.706850	0.628133	0.907969	0.503826
B	0.651118	-0.319318	-0.848077	0.605965

df.iloc[0:2]

	W	X	Y	Z
A	2.706850	0.628133	0.907969	0.503826
B	0.651118	-0.319318	-0.848077	0.605965

# to grab particular data points, lets say row AV and columns YZ
df.loc[['A','B']][['Y','Z']]

	Y	Z
A	0.907969	0.503826
B	-0.848077	0.605965

# or
df.loc[['A','B'],['Y','Z']]

	Y	Z
A	0.907969	0.503826
B	-0.848077	0.605965

# conditional selection
df > 0

	W	X	Y	Z
A	True	True	True	True
B	True	False	False	True
C	False	True	True	False
D	True	False	False	True
E	True	True	True	True

# pass the df_bool to original df to get the results of the condition
df[df_bool]

	W	X	Y	Z
A	2.706850	0.628133	0.907969	0.503826
B	0.651118	NaN	NaN	0.605965
C	NaN	0.740122	0.528813	NaN
D	0.188695	NaN	NaN	0.955057
E	0.190794	1.978757	2.605967	0.683509

# or 
df[df > 0]

	W	X	Y	Z
A	2.706850	0.628133	0.907969	0.503826
B	0.651118	NaN	NaN	0.605965
C	NaN	0.740122	0.528813	NaN
D	0.188695	NaN	NaN	0.955057
E	0.190794	1.978757	2.605967	0.683509

# to return the rows in a column with values greater than 0.5
df[df['W']>0.5]

	W	X	Y	Z
A	2.706850	0.628133	0.907969	0.503826
B	0.651118	-0.319318	-0.848077	0.605965

# lets get it from column 'Y'
df[df['W']> 0.5]['Y']

A    0.907969
B   -0.848077
Name: Y, dtype: float64

#multiple conditions
df[(df['W'] > 0) & (df['Y'] > 1)]

	W	X	Y	Z
E	0.190794	1.978757	2.605967	0.683509

# resetting the index-- turning the index into a new column name index
df.reset_index()

	index	W	X	Y	Z
0	A	2.706850	0.628133	0.907969	0.503826
1	B	0.651118	-0.319318	-0.848077	0.605965
2	C	-2.018168	0.740122	0.528813	-0.589001
3	D	0.188695	-0.758872	-0.933237	0.955057
4	E	0.190794	1.978757	2.605967	0.683509

# changes are not permanent as inplace is defaulted to false
df

	W	X	Y	Z
A	2.706850	0.628133	0.907969	0.503826
B	0.651118	-0.319318	-0.848077	0.605965
C	-2.018168	0.740122	0.528813	-0.589001
D	0.188695	-0.758872	-0.933237	0.955057
E	0.190794	1.978757	2.605967	0.683509

# resetting the index
new_indx = 'CA NY WY OR CO'.split()

new_indx
['CA', 'NY', 'WY', 'OR', 'CO']


#create new column
df['States'] = new_indx
df

	W	X	Y	Z	States
A	2.706850	0.628133	0.907969	0.503826	CA
B	0.651118	-0.319318	-0.848077	0.605965	NY
C	-2.018168	0.740122	0.528813	-0.589001	WY
D	0.188695	-0.758872	-0.933237	0.955057	OR
E	0.190794	1.978757	2.605967	0.683509	CO

# set states as index -- inplace is always false
df.set_index('States')

	W	X	Y	Z
States
CA	2.706850	0.628133	0.907969	0.503826
NY	0.651118	-0.319318	-0.848077	0.605965
WY	-2.018168	0.740122	0.528813	-0.589001
OR	0.188695	-0.758872	-0.933237	0.955057
CO	0.190794	1.978757	2.605967	0.683509

# get df info
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5 entries, A to E
Data columns (total 5 columns):
W         5 non-null float64
X         5 non-null float64
Y         5 non-null float64
Z         5 non-null float64
States    5 non-null object
dtypes: float64(4), object(1)
memory usage: 400.0+ bytes

#get df types
df.dtypes

W         float64
X         float64
Y         float64
Z         float64
States     object
dtype: object

 statistics 
df.describe()

	W	X	Y	Z
count	5.000000	5.000000	5.000000	5.000000
mean	0.343858	0.453764	0.452287	0.431871
std	1.681131	1.061385	1.454516	0.594708
min	-2.018168	-0.758872	-0.933237	-0.589001
25%	0.188695	-0.319318	-0.848077	0.503826
50%	0.190794	0.628133	0.528813	0.605965
75%	0.651118	0.740122	0.907969	0.683509
max	2.706850	1.978757	2.605967	0.955057

Share on

Twitter Facebook Google+ LinkedIn

Gabe Maldonado

Pandas Basics

Share on

You may also enjoy

Deployed Web Apps

Mapping Geodata with Folium

Working with Timeseries Data II

Working with Timeseries Data