ML Snippets
Shuffling and splitting
# Shuffle two parallel arrays in unison: the same permutation is applied
# to both, so x[i] stays paired with y[i].
from sklearn.utils import shuffle
x, y = shuffle(x, y, random_state=42)  # random_state pins the permutation for reproducibility
# Hold out 20% of the rows for testing; random_state makes the split reproducible.
from sklearn.model_selection import train_test_split
# Split a whole DataFrame in one piece...
df_train, df_test = train_test_split(df, test_size=0.2, random_state=42, shuffle=True)
# ...or split feature and label arrays together (rows stay aligned across X and y).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
import numpy as np
# Pure-NumPy unison shuffle: draw one random permutation of the row
# indices and index both arrays with it, so rows stay aligned.
perm = np.random.permutation(data.shape[0])
data = data[perm]
labels = labels[perm]
Balance a dataset
For training on an unbalanced dataset, the simplest solution is to reduce the size of the most populated classes to the size of the least populated one:
import pandas as pd
import numpy as np
# Toy dataset: two features and a (roughly balanced here, but in general
# imbalanced) 3-class target.
df = pd.DataFrame({'feature_1': np.random.choice([4, 5], size=100),
                   'feature_2': np.random.random(size=100),
                   'target': np.random.choice([0, 1, 2], size=100)})
# Down-sample every class to the size of the rarest one.
# GroupBy.sample does this directly: it avoids the deprecated pattern of
# .apply() over the grouping column (and the MultiIndex it produces), and
# the minimum class size is computed once instead of once per group.
min_size = df['target'].value_counts().min()
df_bal = df.groupby('target').sample(n=min_size).reset_index(drop=True)
Here’s a countplot before and after the balancing:
# Side-by-side class-distribution plots: df (original) vs df_bal (balanced),
# both produced by the balancing snippet above. countplot only reads the
# 'target' column, so the index structure of df_bal does not matter.
import seaborn as sns
import matplotlib.pyplot as plt
fig, (ax0, ax1) = plt.subplots(1, 2, figsize=(12,3))
sns.countplot(data=df, y='target', ax=ax0).set_title('Before')
sns.countplot(data=df_bal, y='target', ax=ax1).set_title('After')
Plot training-history metrics for TensorFlow models
def plot_history(model):
    """Plot each training metric (with its ``val_`` counterpart) from a fitted model.

    Parameters
    ----------
    model : a fitted Keras-style model — assumes ``model.history.history``
        holds per-epoch metric lists as recorded by ``fit()``.

    Draws one subplot per training metric; a validation curve is overlaid
    only when the corresponding ``val_`` key exists (i.e. validation data
    was supplied to ``fit()``).
    """
    history = model.history.history
    # Training metrics only; matching 'val_*' entries are paired below.
    history_keys = [k for k in history if 'val' not in k]
    n_axis = len(history_keys)
    # squeeze=False guarantees an array of axes even for a single metric
    # (plt.subplots would otherwise return a bare Axes and zip would break).
    fig, axs = plt.subplots(1, n_axis, figsize=(n_axis * 5, 3), squeeze=False)
    for ax, key in zip(axs.ravel(), history_keys):
        ax.plot(history[key], label=key)
        val_key = 'val_' + key
        if val_key in history:  # guard: fit() may have run without validation data
            ax.plot(history[val_key], label=val_key)
        ax.set(xlim=(0, len(history[key]) - 1), xlabel='epochs')
        ax.legend()

# Example usage, given a fitted tf model:
# plot_history(model)