missingno

Links: notebook, html, PDF, python, slides, GitHub

missingno visualizes the missing values of a dataframe.

documentation source installation tutorial gallery

# Notebook setup: insert the jyquickhelper notebook menu
# (presumably a table of contents for this page -- confirm).
from jyquickhelper import add_notebook_menu
add_notebook_menu()
# IPython magic: draw matplotlib figures inline, right below each cell.
%matplotlib inline

example

Taken from NYPD-Motor-Vehicle-Collisions.

import os
import pandas

# Create the 10,000-row sample only when it is not on disk yet;
# the full export is about 153 Mb, so it is read just this once.
if not os.path.exists("NYPD_Motor_Vehicle_Collisions_sample.csv"):
    # to_csv also stores the index, which comes back as the
    # "Unnamed: 0" column when the sample is reloaded below.
    pandas.read_csv("NYPD_Motor_Vehicle_Collisions.csv").sample(10000).to_csv(
        "NYPD_Motor_Vehicle_Collisions_sample.csv")

# Every later cell works on the sample, never on the full file.
df = pandas.read_csv("NYPD_Motor_Vehicle_Collisions_sample.csv")
df.dtypes
Unnamed: 0                         int64
DATE                              object
TIME                              object
BOROUGH                           object
ZIP CODE                         float64
LATITUDE                         float64
LONGITUDE                        float64
LOCATION                          object
ON STREET NAME                    object
CROSS STREET NAME                 object
OFF STREET NAME                   object
NUMBER OF PERSONS INJURED          int64
NUMBER OF PERSONS KILLED           int64
NUMBER OF PEDESTRIANS INJURED      int64
NUMBER OF PEDESTRIANS KILLED       int64
NUMBER OF CYCLIST INJURED          int64
NUMBER OF CYCLIST KILLED           int64
NUMBER OF MOTORIST INJURED         int64
NUMBER OF MOTORIST KILLED          int64
CONTRIBUTING FACTOR VEHICLE 1     object
CONTRIBUTING FACTOR VEHICLE 2     object
CONTRIBUTING FACTOR VEHICLE 3     object
CONTRIBUTING FACTOR VEHICLE 4     object
CONTRIBUTING FACTOR VEHICLE 5     object
UNIQUE KEY                         int64
VEHICLE TYPE CODE 1               object
VEHICLE TYPE CODE 2               object
VEHICLE TYPE CODE 3               object
VEHICLE TYPE CODE 4               object
VEHICLE TYPE CODE 5               object
dtype: object
import missingno

# Draw 250 random rows so that the nullity matrix stays readable,
# then plot where values are present or missing in each column.
sam = df.sample(250)
missingno.matrix(sam)
../_images/im_missingno_8_0.png
# Plot the heatmap of the sample; a KeyError is reported as a probable
# version mismatch between pandas and missingno instead of stopping
# the notebook.
try:
    missingno.heatmap(sam)
except KeyError:
    print("Maybe a mismatch between pandas and missingno.")
../_images/im_missingno_9_0.png
# Dendrogram built from the same 250-row sample.
missingno.dendrogram(sam)
../_images/im_missingno_10_0.png
# Reduce the sample to a subset of columns before plotting.
# NOTE(review): with filter='bottom', n and p presumably cap the number of
# columns (15) and their completeness ratio (0.999) -- confirm against the
# missingno documentation.
filtered_data = missingno.nullity_filter(sam, filter='bottom', n=15, p=0.999)
missingno.matrix(filtered_data)
../_images/im_missingno_11_0.png
# Sort the rows of the sample by completeness (descending) and plot them
# in that order.
# The sorted frame is plotted as is: DataFrame.sample() returns rows in
# random order, so drawing 250 rows out of the 250 sorted ones would only
# shuffle them and undo the sort.
sorted_data = missingno.nullity_sort(sam, sort='descending')
missingno.matrix(sorted_data)
../_images/im_missingno_12_0.png