missingno#
Links: notebook
, html, PDF
, python
, slides, GitHub
missingno visualizes the missing values of a dataframe.
documentation source installation tutorial gallery
from jyquickhelper import add_notebook_menu
add_notebook_menu()
%matplotlib inline
example#
Taken from NYPD-Motor-Vehicle-Collisions.
import pandas, os
# Build the 10,000-row sample once; later runs reuse the cached CSV.
if not os.path.exists("NYPD_Motor_Vehicle_Collisions_sample.csv"):
    # the full file is 153 Mb
    df = pandas.read_csv("NYPD_Motor_Vehicle_Collisions.csv")
    # to_csv also writes the index, which is read back as the "Unnamed: 0" column.
    df.sample(10000).to_csv("NYPD_Motor_Vehicle_Collisions_sample.csv")
# Always reload from the sample file so that df is built the same way
# whether or not the sample was just created.
df = pandas.read_csv("NYPD_Motor_Vehicle_Collisions_sample.csv")
df.dtypes
Unnamed: 0 int64
DATE object
TIME object
BOROUGH object
ZIP CODE float64
LATITUDE float64
LONGITUDE float64
LOCATION object
ON STREET NAME object
CROSS STREET NAME object
OFF STREET NAME object
NUMBER OF PERSONS INJURED int64
NUMBER OF PERSONS KILLED int64
NUMBER OF PEDESTRIANS INJURED int64
NUMBER OF PEDESTRIANS KILLED int64
NUMBER OF CYCLIST INJURED int64
NUMBER OF CYCLIST KILLED int64
NUMBER OF MOTORIST INJURED int64
NUMBER OF MOTORIST KILLED int64
CONTRIBUTING FACTOR VEHICLE 1 object
CONTRIBUTING FACTOR VEHICLE 2 object
CONTRIBUTING FACTOR VEHICLE 3 object
CONTRIBUTING FACTOR VEHICLE 4 object
CONTRIBUTING FACTOR VEHICLE 5 object
UNIQUE KEY int64
VEHICLE TYPE CODE 1 object
VEHICLE TYPE CODE 2 object
VEHICLE TYPE CODE 3 object
VEHICLE TYPE CODE 4 object
VEHICLE TYPE CODE 5 object
dtype: object
import missingno

# Work on a random 250-row subset of df for the plots.
sam = df.sample(n=250)
missingno.matrix(sam)
# heatmap can raise KeyError here; report it instead of stopping the notebook.
try:
    missingno.heatmap(sam)
except KeyError:
    print("Maybe a mismatch between pandas and missingno.")
# Draw missingno's dendrogram for the same 250-row sample.
missingno.dendrogram(sam)
# Filter the sample with missingno.nullity_filter ('bottom', n=15, p=0.999),
# then plot the nullity matrix of what is left.
filtered_data = missingno.nullity_filter(sam, filter="bottom", p=0.999, n=15)
missingno.matrix(filtered_data)
# Sort the rows of the sample by nullity and plot them in that order.
# The sorted frame is drawn directly: sam has only 250 rows, so sampling
# 250 rows from it again would just shuffle them and undo the sort.
sorted_data = missingno.nullity_sort(sam, sort='descending')
missingno.matrix(sorted_data)