from car_speech.fname_processing import load_fnames
from car_speech.pipeline import *

import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf
from tensorflow.keras.layers.experimental import preprocessing
from tensorflow.keras import layers
from tensorflow.keras import models

Configuration

# Dataset this notebook works on: 'digits' here ('letters' is the alternative).
DATASET_TYPE = 'digits'
# Class names '0' .. '9' as a NumPy array of strings.
label_strings = np.array(list(map(str, range(10))))

# Load the classified filenames from the 35D digit noise-level file.
filenames = load_fnames('noise_levels/digit_noise_levels/35D.data')

print('number of files:', len(filenames))

number of files: 1537

Pipeline (Audio to Spectrogram)

# Shuffle the filename list (pipeline helper) before it is split.
filenames = shuffle_data(filenames)

# Train/Validation/Test split: the helper's result is indexed as
# [0] = train, [1] = validation, [2] = test.
split_result = train_test_split(filenames)
train_files, val_files, test_files = (
    split_result[0],
    split_result[1],
    split_result[2],
)

# Run each subset through the combined preprocessing pipeline, in the order
# train -> validation -> test.
train_ds, val_ds, test_ds = (
    preprocess_dataset(files, DATASET_TYPE)
    for files in (train_files, val_files, test_files)
)
# Second handle on the training dataset; it keeps pointing at this dataset
# even after train_ds is rebound further down (batching / caching).
spectrogram_ds = train_ds

print("Pipeline Completed")

Training set size: 1229
Validation set size: 153
Test set size: 155
Pipeline Completed

Train on training set

Split data into batches

# Number of samples per mini-batch.
batch_size = 64
# Batch the training and validation datasets with the same batch size.
train_ds, val_ds = [ds.batch(batch_size) for ds in (train_ds, val_ds)]

Reduce read latency during training

# Sentinel that lets tf.data pick the prefetch buffer size itself.
AUTOTUNE = tf.data.experimental.AUTOTUNE

# Cache each dataset, then prefetch from the cached version.
train_ds, val_ds = (
    ds.cache().prefetch(AUTOTUNE)
    for ds in (train_ds, val_ds)
)

Model

# One output unit per class name.
num_labels = len(label_strings)

# Look at a single training example to learn the per-sample input shape.
for spectrogram, _ in spectrogram_ds.take(1):
    input_shape = spectrogram.shape
print('Input shape:', input_shape)

# Normalization layer whose statistics are computed from the training
# spectrograms only (the labels are dropped by the map).
norm_layer = preprocessing.Normalization()
norm_layer.adapt(spectrogram_ds.map(lambda spec, _label: spec))

# Small CNN assembled layer by layer, in the same order as a list-style
# Sequential definition would use.
model = models.Sequential()
model.add(layers.Input(shape=input_shape))
# Downsample every spectrogram to 32x32 before the convolutions.
model.add(preprocessing.Resizing(32, 32))
model.add(norm_layer)
model.add(layers.Conv2D(32, 3, activation='relu'))
model.add(layers.Conv2D(64, 3, activation='relu'))
model.add(layers.MaxPooling2D())
model.add(layers.Dropout(0.25))
model.add(layers.Flatten())
model.add(layers.Dense(128, activation='relu'))
model.add(layers.Dropout(0.5))
# No activation on the last layer: the model outputs raw logits.
model.add(layers.Dense(num_labels))

model.summary()

Input shape: (124, 129, 1)
Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
=================================================================
resizing (Resizing)          (None, 32, 32, 1)         0         
_________________________________________________________________
normalization (Normalization (None, 32, 32, 1)         3         
_________________________________________________________________
conv2d (Conv2D)              (None, 30, 30, 32)        320       
_________________________________________________________________
conv2d_1 (Conv2D)            (None, 28, 28, 64)        18496     
_________________________________________________________________
max_pooling2d (MaxPooling2D) (None, 14, 14, 64)        0         
_________________________________________________________________
dropout (Dropout)            (None, 14, 14, 64)        0         
_________________________________________________________________
flatten (Flatten)            (None, 12544)             0         
_________________________________________________________________
dense (Dense)                (None, 128)               1605760   
_________________________________________________________________
dropout_1 (Dropout)          (None, 128)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 10)                1290      
=================================================================
Total params: 1,625,869
Trainable params: 1,625,866
Non-trainable params: 3
_________________________________________________________________

# Adam with its default settings.
adam_optimizer = tf.keras.optimizers.Adam()
# Integer class labels against raw logits (the final Dense layer has no
# activation), hence from_logits=True.
logits_loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)

model.compile(
    optimizer=adam_optimizer,
    loss=logits_loss,
    metrics=['accuracy'],
)

Train

# Upper bound on the number of training epochs; early stopping may end the
# run sooner.
EPOCHS = 25

# Stop training after 2 epochs without improvement of the monitored metric
# (EarlyStopping's default monitor is used) and log when that happens.
early_stopping = tf.keras.callbacks.EarlyStopping(verbose=1, patience=2)

history = model.fit(
    train_ds,
    validation_data=val_ds,
    epochs=EPOCHS,
    # `callbacks` is documented as a list of callback instances, so pass a
    # list rather than a bare callback object.
    callbacks=[early_stopping],
)

Epoch 1/25
20/20 [==============================] - 2s 105ms/step - loss: 2.2106 - accuracy: 0.2075 - val_loss: 2.0273 - val_accuracy: 0.3137
Epoch 2/25
20/20 [==============================] - 1s 69ms/step - loss: 1.8210 - accuracy: 0.3686 - val_loss: 1.6323 - val_accuracy: 0.4837
Epoch 3/25
20/20 [==============================] - 1s 72ms/step - loss: 1.5195 - accuracy: 0.4980 - val_loss: 1.5472 - val_accuracy: 0.5163
Epoch 4/25
20/20 [==============================] - 1s 74ms/step - loss: 1.3168 - accuracy: 0.5590 - val_loss: 1.4113 - val_accuracy: 0.5621
Epoch 5/25
20/20 [==============================] - 1s 70ms/step - loss: 1.2360 - accuracy: 0.5932 - val_loss: 1.3241 - val_accuracy: 0.5882
Epoch 6/25
20/20 [==============================] - 1s 69ms/step - loss: 1.0837 - accuracy: 0.6314 - val_loss: 1.2757 - val_accuracy: 0.6209
Epoch 7/25
20/20 [==============================] - 1s 70ms/step - loss: 0.9810 - accuracy: 0.6664 - val_loss: 1.3389 - val_accuracy: 0.5948
Epoch 8/25
20/20 [==============================] - 1s 69ms/step - loss: 0.9021 - accuracy: 0.6721 - val_loss: 1.2648 - val_accuracy: 0.6209
Epoch 9/25
20/20 [==============================] - 1s 68ms/step - loss: 0.8267 - accuracy: 0.7168 - val_loss: 1.2477 - val_accuracy: 0.6275
Epoch 10/25
20/20 [==============================] - 1s 69ms/step - loss: 0.7798 - accuracy: 0.7266 - val_loss: 1.2782 - val_accuracy: 0.5948
Epoch 11/25
20/20 [==============================] - 1s 73ms/step - loss: 0.6977 - accuracy: 0.7575 - val_loss: 1.2571 - val_accuracy: 0.6536
Epoch 00011: early stopping

Save model

# Name the model would be saved under. Make sure you change this name for
# every new run. DO NOT OVERWRITE TRAINED MODELS.
model_name = 'model_single_digit'
# Folder that holds the saved models.
model_dir = 'models'
# Saving is disabled; uncomment the next line to write the model to disk.
# NOTE(review): it needs `os` in scope, which this file does not import
# explicitly -- confirm before enabling.
# model.save(os.path.join(model_dir, model_name))

Plot loss

# Per-epoch training/validation loss recorded by model.fit().
metrics = history.history

# Plot each curve explicitly against the epoch numbers. Passing both series
# to a single plot(x, y1, y2) call would draw the second one against an
# implicit 0..n-1 index instead of history.epoch.
plt.plot(history.epoch, metrics['loss'], label='loss')
plt.plot(history.epoch, metrics['val_loss'], label='val_loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend()
plt.show()

Test

# Materialise the test dataset as NumPy arrays: one entry per
# (spectrogram, label) element yielded by test_ds.
test_audio = []
test_labels = []

for audio, label in test_ds:
    test_audio.append(audio.numpy())
    test_labels.append(label.numpy())

test_audio = np.array(test_audio)
test_labels = np.array(test_labels)

# The model outputs one score per class; the predicted class is the index of
# the highest score.
y_pred = np.argmax(model.predict(test_audio), axis=1)
y_true = test_labels

# Fraction of correct predictions, computed with a vectorised mean instead of
# the Python builtin sum() iterating over a NumPy boolean array.
test_acc = np.mean(y_pred == y_true)
print(f'Test set accuracy: {test_acc:.0%}')

Test set accuracy: 62%

Plot confusion matrix

# Pin the matrix size to the number of classes. Without num_classes the size
# is derived from the largest label/prediction value seen, so a class missing
# from the test set would shrink the matrix and misalign the tick labels.
confusion_mtx = tf.math.confusion_matrix(
    y_true,
    y_pred,
    num_classes=len(label_strings),
)

# Rows are the true labels, columns the predicted labels.
plt.figure(figsize=(10, 8))
sns.heatmap(
    confusion_mtx,
    xticklabels=label_strings,
    yticklabels=label_strings,
    annot=True,
    fmt='g',
)
plt.xlabel('Prediction')
plt.ylabel('Label')
plt.show()

Training with Digits Data

Configuration

Pipeline (Audio to Spectrogram)

Train on training set

Test