Pandas simple EDA
The following is a simple script to perform a quick exploratory data analysis (EDA) of a pandas DataFrame. For a more complete analysis, one can use one of the following libraries:
- dataprep: collect, explore and clean all in one tool.
- pandas-profiling (now published as ydata-profiling): perfect for HTML reports.
- sweetviz: best for dataset comparison.
In the following code, `df` is the pandas DataFrame to be analyzed and `df_out` is the output EDA DataFrame (one row per column of `df`).
import numpy as np
import pandas as pd
## df is the dataframe to analyze; df_out is the resulting summary
## (one row per column of df).

# Let pandas choose the best NA-aware dtype for every column, so that the
# dtype checks below also work on columns that were loaded as ``object``.
df_c = df.convert_dtypes()

# Position (not label) of the row with the smallest share of missing values;
# that row supplies the 'value example' of every column. An empty frame has
# no such row and argmin() would fail on it, so None is used instead.
fullest_row = None if df_c.empty else df_c.isna().mean(axis=1).argmin()

# Columns with fewer distinct values than this are reported as categorical.
cat_threshold = 50

info_list = []
for column_name, s in df_c.items():
    info_dict = {}
    n_unique = s.nunique()
    is_cat = n_unique < cat_threshold  # few distinct values -> categorical
    is_num = pd.api.types.is_numeric_dtype(s.dtype)
    is_str = pd.api.types.is_string_dtype(s.dtype)
    is_date = pd.api.types.is_datetime64_any_dtype(s.dtype)

    info_dict['name'] = column_name
    # Mean of a boolean mask, i.e. a fraction between 0 and 1.
    info_dict['nan percentage'] = s.isna().mean()
    # fullest_row is positional, so it must be read with .iloc; plain s[...]
    # is a label lookup and breaks on any non-default index.
    info_dict['value example'] = (
        None if fullest_row is None else s.iloc[fullest_row]
    )
    info_dict['number of unique values'] = n_unique

    if is_cat:
        info_dict['data type'] = 'categorical'
        # value_counts() is sorted by descending frequency and ignores
        # missing values, so it holds exactly n_unique entries.
        counts = s.value_counts()
        info_dict['1st frequent'] = counts.index[0] if n_unique > 0 else None
        if n_unique > 1:
            info_dict['2nd frequent'] = counts.index[1]
    elif is_num:
        info_dict['data type'] = 'numerical'
        info_dict['mean value'] = s.mean()
        info_dict['10 percentile'] = s.quantile(0.1)
        info_dict['50 percentile'] = s.quantile(0.5)
        info_dict['90 percentile'] = s.quantile(0.9)
    elif is_str:
        info_dict['data type'] = 'string'
        info_dict['average number of character'] = s.str.len().mean()
    elif is_date:
        info_dict['data type'] = 'date'
        first_date = s.min()
        last_date = s.max()
        time_span = last_date - first_date
        info_dict['min_date'] = first_date
        info_dict['max_date'] = last_date
        info_dict['tot_time'] = time_span
        # Many distinct timestamps can still fall within a single day, in
        # which case the span has zero whole days; report NaN instead of
        # raising ZeroDivisionError.
        span_days = time_span.days
        info_dict['avg_day_entry'] = (
            len(s) / span_days if span_days else np.nan
        )

    info_list.append(info_dict)

# Keys missing from a column's dict (statistics that do not apply to its
# data type) become NaN in the resulting frame.
df_out = pd.DataFrame(info_list)