Train on letters data.
from car_speech.fname_processing import load_fnames
from car_speech.pipeline import shuffle_data, train_test_split, preprocess_dataset

import os
import string
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf
from tensorflow.keras.layers.experimental import preprocessing
from tensorflow.keras import layers
from tensorflow.keras import models

Configuration

DATASET_TYPE = 'letters' # or 'digits'
label_strings = np.array(list(string.ascii_uppercase))
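
The model's integer class indices map straight into label_strings, so a prediction of 0 means 'A' and 25 means 'Z':

# Index <-> letter mapping used when decoding predictions below.
assert label_strings[0] == 'A' and label_strings[25] == 'Z'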

# load classified filenames
filenames = load_fnames('noise_levels/letter_noise_levels/IDL.data')

print('number of files:', len(filenames))
number of files: 2695

Pipeline (Audio to Spectrogram)

filenames = shuffle_data(filenames)

# Train/Validation/Test Split
train_files, val_files, test_files = train_test_split(filenames)
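
train_test_split comes from car_speech.pipeline. Judging from the sizes printed below (2156/269/270 of 2695 files), it performs roughly an 80/10/10 split of the already-shuffled list. A minimal sketch under that assumption (split_80_10_10 is a hypothetical stand-in, not the library function):

def split_80_10_10(files):
    # Hypothetical stand-in for car_speech.pipeline.train_test_split:
    # slice an already-shuffled list into ~80/10/10 train/val/test parts.
    n = len(files)
    n_train, n_val = int(0.8 * n), int(0.1 * n)
    return files[:n_train], files[n_train:n_train + n_val], files[n_train + n_val:]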

# Process data using the combined pipeline. Keep the unbatched training
# dataset (spectrogram_ds) around: it is reused below for the Normalization
# adapt() and to read the model's input shape.
spectrogram_ds = preprocess_dataset(train_files, DATASET_TYPE)
train_ds = spectrogram_ds
val_ds = preprocess_dataset(val_files, DATASET_TYPE)
test_ds = preprocess_dataset(test_files, DATASET_TYPE)

print("Pipeline Completed")
Training set size: 2156
Validation set size: 269
Test set size: 270
Pipeline Completed
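
preprocess_dataset (from car_speech.pipeline) hides the audio-to-spectrogram conversion. A plausible sketch of its core, assuming it follows the standard TensorFlow simple_audio recipe; every parameter here is an assumption, chosen only because it reproduces the (124, 129, 1) input shape printed in the Model section:

def wav_to_spectrogram(file_path):
    # Assumed format: 16 kHz mono WAV, padded/cut to 1 s (16000 samples).
    audio_binary = tf.io.read_file(file_path)
    waveform, _ = tf.audio.decode_wav(audio_binary, desired_channels=1)
    waveform = tf.squeeze(waveform, axis=-1)[:16000]
    zero_padding = tf.zeros([16000] - tf.shape(waveform), dtype=tf.float32)
    waveform = tf.concat([waveform, zero_padding], 0)
    # STFT with frame_length=255, frame_step=128 yields 124 frames x 129 bins.
    spectrogram = tf.abs(tf.signal.stft(waveform, frame_length=255, frame_step=128))
    return spectrogram[..., tf.newaxis]  # (124, 129, 1)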

Prepare data for training

Split data into batches

batch_size = 64
train_ds = train_ds.batch(batch_size)
val_ds = val_ds.batch(batch_size)
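
With 2156 training examples and batch_size = 64, each epoch runs ceil(2156 / 64) = 34 batches, matching the 34/34 progress bars in the training log below:

import math
print(math.ceil(2156 / 64))  # 34 steps per epoch, as seen in the fit() output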

Reduce read latency during training

AUTOTUNE = tf.data.experimental.AUTOTUNE

train_ds = train_ds.cache().prefetch(AUTOTUNE)
val_ds = val_ds.cache().prefetch(AUTOTUNE)

Model

# Peek at one example to read the spectrogram shape the model must accept.
for spectrogram, _ in spectrogram_ds.take(1):
    input_shape = spectrogram.shape
print('Input shape:', input_shape)
num_labels = len(label_strings)

# Normalization learns mean/variance from the training spectrograms via
# adapt(); these account for the 3 non-trainable params in the summary below.
norm_layer = preprocessing.Normalization()
norm_layer.adapt(spectrogram_ds.map(lambda x, _: x))

model = models.Sequential([
    layers.Input(shape=input_shape),
    preprocessing.Resizing(32, 32), 
    norm_layer,
    layers.Conv2D(32, 3, activation='relu'),
    layers.Conv2D(64, 3, activation='relu'),
    layers.MaxPooling2D(),
    layers.Dropout(0.25),
    layers.Flatten(),
    layers.Dense(128, activation='relu'),
    layers.Dropout(0.5),
    layers.Dense(num_labels),
])

model.summary()
Input shape: (124, 129, 1)
Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
=================================================================
resizing_1 (Resizing)        (None, 32, 32, 1)         0         
_________________________________________________________________
normalization_1 (Normalizati (None, 32, 32, 1)         3         
_________________________________________________________________
conv2d_2 (Conv2D)            (None, 30, 30, 32)        320       
_________________________________________________________________
conv2d_3 (Conv2D)            (None, 28, 28, 64)        18496     
_________________________________________________________________
max_pooling2d_1 (MaxPooling2 (None, 14, 14, 64)        0         
_________________________________________________________________
dropout_2 (Dropout)          (None, 14, 14, 64)        0         
_________________________________________________________________
flatten_1 (Flatten)          (None, 12544)             0         
_________________________________________________________________
dense_2 (Dense)              (None, 128)               1605760   
_________________________________________________________________
dropout_3 (Dropout)          (None, 128)               0         
_________________________________________________________________
dense_3 (Dense)              (None, 26)                3354      
=================================================================
Total params: 1,627,933
Trainable params: 1,627,930
Non-trainable params: 3
_________________________________________________________________
model.compile(
    optimizer=tf.keras.optimizers.Adam(),
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=['accuracy'],
)
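
Because the final Dense layer emits raw logits (no activation) and the loss is built with from_logits=True, the model does not output probabilities directly; apply a softmax at inference time. A quick sketch on one validation batch:

# Turn logits into per-class probabilities for one validation batch.
for spectrogram_batch, _ in val_ds.take(1):
    probs = tf.nn.softmax(model(spectrogram_batch), axis=-1)
    print(probs.shape)  # (batch_size, 26)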

Train

EPOCHS = 25
history = model.fit(
    train_ds, 
    validation_data=val_ds,  
    epochs=EPOCHS,
    callbacks=[tf.keras.callbacks.EarlyStopping(verbose=1, patience=2)],
)
Epoch 1/25
34/34 [==============================] - 5s 135ms/step - loss: 3.1225 - accuracy: 0.0960 - val_loss: 2.8055 - val_accuracy: 0.1933
Epoch 2/25
34/34 [==============================] - 2s 73ms/step - loss: 2.7563 - accuracy: 0.2008 - val_loss: 2.4798 - val_accuracy: 0.3123
Epoch 3/25
34/34 [==============================] - 2s 70ms/step - loss: 2.4795 - accuracy: 0.2542 - val_loss: 2.2751 - val_accuracy: 0.3569
Epoch 4/25
34/34 [==============================] - 2s 68ms/step - loss: 2.2384 - accuracy: 0.3066 - val_loss: 2.0870 - val_accuracy: 0.3792
Epoch 5/25
34/34 [==============================] - 2s 68ms/step - loss: 2.0561 - accuracy: 0.3650 - val_loss: 1.9841 - val_accuracy: 0.4424
Epoch 6/25
34/34 [==============================] - 2s 69ms/step - loss: 1.8595 - accuracy: 0.4142 - val_loss: 1.9525 - val_accuracy: 0.4052
Epoch 7/25
34/34 [==============================] - 2s 69ms/step - loss: 1.7366 - accuracy: 0.4369 - val_loss: 1.8837 - val_accuracy: 0.4610
Epoch 8/25
34/34 [==============================] - 2s 69ms/step - loss: 1.6082 - accuracy: 0.4870 - val_loss: 1.8452 - val_accuracy: 0.4275
Epoch 9/25
34/34 [==============================] - 2s 70ms/step - loss: 1.4522 - accuracy: 0.5320 - val_loss: 1.8134 - val_accuracy: 0.4721
Epoch 10/25
34/34 [==============================] - 2s 69ms/step - loss: 1.3312 - accuracy: 0.5733 - val_loss: 1.7829 - val_accuracy: 0.4833
Epoch 11/25
34/34 [==============================] - 2s 69ms/step - loss: 1.2478 - accuracy: 0.5867 - val_loss: 1.8158 - val_accuracy: 0.4610
Epoch 12/25
34/34 [==============================] - 2s 69ms/step - loss: 1.1610 - accuracy: 0.6197 - val_loss: 1.8049 - val_accuracy: 0.5093
Epoch 00012: early stopping

Save model

model_dir = 'models'
# Create a folder and save the model
model_name = 'model_letters' # Make sure you change this name. DO NOT OVERWRITE TRAINED MODELS.
# model.save(os.path.join(model_dir, model_name))
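
If the save line above is uncommented, the trained model can later be restored without retraining (same path as above):

# restored_model = tf.keras.models.load_model(os.path.join(model_dir, model_name))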

Plot loss

metrics = history.history
plt.plot(history.epoch, metrics['loss'], label='loss')
plt.plot(history.epoch, metrics['val_loss'], label='val_loss')
plt.legend()
plt.show()
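
history.history also records the accuracy metric requested in compile(), so the same pattern plots the accuracy curves:

plt.plot(history.epoch, metrics['accuracy'], label='accuracy')
plt.plot(history.epoch, metrics['val_accuracy'], label='val_accuracy')
plt.legend()
plt.show()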

Test

# Collect the (unbatched) test set into NumPy arrays for evaluation.
test_audio = []
test_labels = []

for audio, label in test_ds:
    test_audio.append(audio.numpy())
    test_labels.append(label.numpy())

test_audio = np.array(test_audio)
test_labels = np.array(test_labels)
y_pred = np.argmax(model.predict(test_audio), axis=1)
y_true = test_labels

test_acc = sum(y_pred == y_true) / len(y_true)
print(f'Test set accuracy: {test_acc:.0%}')
Test set accuracy: 43%

Plot confusion matrix

confusion_mtx = tf.math.confusion_matrix(y_true, y_pred) 
plt.figure(figsize=(10, 8))
sns.heatmap(confusion_mtx, xticklabels=label_strings, yticklabels=label_strings, 
            annot=True, fmt='g')
plt.xlabel('Prediction')
plt.ylabel('Label')
plt.show()
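
The diagonal of the confusion matrix counts correct predictions per letter, so per-class accuracy falls out directly; sorting it surfaces the letters the model handles worst:

# Per-class accuracy from the confusion matrix diagonal.
cm = confusion_mtx.numpy()
row_totals = np.maximum(cm.sum(axis=1), 1)  # guard against empty classes
per_class_acc = np.diag(cm) / row_totals
for letter, acc in sorted(zip(label_strings, per_class_acc), key=lambda t: t[1]):
    print(f'{letter}: {acc:.0%}')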