Dataset links for downloading:
From original source:
From Google Drive:
The digits and letters datasets each include only a subset of the AVICAR recordings: isolated single digits and single letters, respectively.
import os
import pathlib
import glob
import string
import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers.experimental import preprocessing
from tensorflow.keras import layers
from tensorflow.keras import models
from IPython import display
# Use seed 66 for consistent experiments
seed = 66
tf.random.set_seed(seed)
np.random.seed(seed)
Choose a dataset. Choices: digits, letters, mixed
choices = ['digits', 'letters', 'mixed']
dataset_type = choices[0]
Download the data manually from Google Drive if the original source's website is down, and place the downloaded folder under the 'data' directory.
download_from = 'origin'
data_dir = pathlib.Path('data/avicar_some'+dataset_type)
def download_data(data_dir, dataset_type, download_from):
if download_from == 'origin':
file_name = 'avicar_some' + dataset_type + '.zip'
download_link = "http://www.isle.illinois.edu/speech_web_lg/data/avicar_some" + dataset_type + ".zip"
# elif download_from == 'drive':
# file_name = 'avicar_some' + dataset_type + '.zip'
# if dataset_type == 'digits':
# download_link = 'https://drive.google.com/file/d/1SJlrT6kZrhtAmABcO63nVPYrACWMnthD/view?usp=sharing'
# elif dataset_type == 'letters':
# download_link = 'https://drive.google.com/file/d/1TyEMlvPGVyR0xzsdmloMJF8XhAhCbiVe/view?usp=sharing'
if not data_dir.exists():
tf.keras.utils.get_file(
file_name,
origin= download_link,
extract=True,
cache_dir='.', cache_subdir='data')
download_data(data_dir, dataset_type, download_from)
Get label names.
if dataset_type == 'digits':
label_strings = np.array([str(num) for num in range(0,10)])
elif dataset_type == 'letters':
label_strings = np.array(list(string.ascii_lowercase))
else:
label_strings = np.array([str(num) for num in range(0,10)] + list(string.ascii_lowercase))
print(dataset_type + ":", label_strings)
Extract filenames into a list.
Some files may have been placed in the folder by mistake, so we filter out files with incorrect labels. We check a fixed index in the file path; the character at that index indicates the label of the file.
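For reference, here is how that fixed index lines up with a hypothetical path (the exact AVICAR filename layout is an assumption here; adjust the index if your directory prefix differs):
# Hypothetical path, for illustration only: with the 'data/avicar_somedigits/'
# prefix, index 32 of the full path falls on the label character.
example_path = 'data/avicar_somedigits/AF1_35D_D5_C1_M3.wav'  # hypothetical
print(example_path[32])  # -> '5' under this assumed layout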
filenames = tf.io.gfile.glob(str(data_dir)+"*/*.wav")
print("Example filename:", filenames[99])
print()
if dataset_type == "digits":
# Filter out non-digit files
filenames = [fname for fname in filenames if fname[32].isdigit()]
# Count # of examples for each label
for i in range(len(label_strings)):
num_examples = len([fname for fname in filenames if fname[32]==label_strings[i]])
print(f"""# examples for "{label_strings[i]}": {num_examples}""")
elif dataset_type == 'letters':
# Filter out non-letter files
filenames = [fname for fname in filenames if not(fname[32].isdigit())]
# Count # of examples for each label
for i in range(len(label_strings)):
num_examples = len([fname for fname in filenames if fname[33]==label_strings[i].upper()])
print(f"""# examples for "{label_strings[i]}": {num_examples}""")
num_samples = len(filenames)
print('# total examples:', num_samples)
print()
Shuffle files
filenames = tf.random.shuffle(filenames)
print('Example file tensor:', filenames[0])
Train/validation/test split using an 80:10:10 ratio (for example, 1,000 files would split into 800/100/100).
TRAIN_PORTION = 0.8
VAL_PORTION = 0.1
TEST_PORTION = 0.1
train_end = int(num_samples*TRAIN_PORTION)
val_end = train_end + int(num_samples*VAL_PORTION)
train_files = filenames[:train_end]
val_files = filenames[train_end: val_end]
test_files = filenames[val_end:]
print('Training set size:', len(train_files))
print('Validation set size:', len(val_files))
print('Test set size:', len(test_files))
def decode_audio(audio_binary):
audio, _ = tf.audio.decode_wav(audio_binary)
return tf.squeeze(audio, axis=-1)
def get_label(file_path):
parts = tf.strings.split(file_path, os.path.sep)
# be careful with data type here
# this function must return a tensor
label_tensor = tf.strings.substr(parts[-1], pos=9, len=1)
return label_tensor
# print(get_label(train_files[1]))
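As a quick sanity check of the label extraction (the path below is hypothetical; under the assumed naming scheme, position 9 of the basename holds the label character):
# Hypothetical filename, for illustration only.
print(get_label(tf.constant('data/avicar_somedigits/AF1_35D_D5_C1_M3.wav')))
# -> tf.Tensor(b'5', shape=(), dtype=string) under this assumed layout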
def get_waveform_and_label(file_path):
label = get_label(file_path)
audio_binary = tf.io.read_file(file_path)
waveform = decode_audio(audio_binary)
return waveform, label
AUTOTUNE = tf.data.experimental.AUTOTUNE
files_ds = tf.data.Dataset.from_tensor_slices(train_files)
waveform_ds = files_ds.map(get_waveform_and_label, num_parallel_calls=AUTOTUNE)
Display some waveforms picked from our shuffled training set.
rows = 3
cols = 3
n = rows*cols
fig, axes = plt.subplots(rows, cols, figsize=(10, 12))
for i, (audio, label) in enumerate(waveform_ds.take(n)):
r = i // cols
c = i % cols
ax = axes[r][c]
ax.plot(audio.numpy())
ax.set_yticks(np.arange(-1.2, 1.2, 0.2))
label = label.numpy().decode('utf-8')
ax.set_title(label)
plt.show()
Look at the waveform, the spectrogram, and the audio of one example.
Note: Before converting a waveform to a spectrogram, we must pad or cut it to exactly 1 second (16,000 samples at 16 kHz).
def get_spectrogram(waveform):
    waveform = tf.cast(waveform, tf.float32)
    # Scalar difference (in samples) between the 1-second target and the clip length
    diff = 16000 - tf.shape(waveform)[0]
    if diff >= 0:
# Padding for files with less than 16000 samples
zero_padding = tf.zeros([16000] - tf.shape(waveform), dtype=tf.float32)
# Concatenate audio with padding so that all audio clips will be of the same length
equal_length = tf.concat([waveform, zero_padding], 0)
else:
# Cut the tail if audio > 1 second
equal_length = tf.slice(waveform, [0], [16000])
spectrogram = tf.signal.stft(
equal_length, frame_length=255, frame_step=128)
spectrogram = tf.abs(spectrogram)
return spectrogram
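As a quick check with synthetic waveforms (not real recordings), clips shorter and longer than 1 second should both yield the same spectrogram shape:
# Synthetic 16 kHz clips: 0.5 s (padded) and 1.25 s (truncated).
short_wave = tf.random.uniform([8000], minval=-1.0, maxval=1.0)
long_wave = tf.random.uniform([20000], minval=-1.0, maxval=1.0)
print(get_spectrogram(short_wave).shape)  # (124, 129)
print(get_spectrogram(long_wave).shape)   # also (124, 129)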
for waveform, label in waveform_ds.take(1):
label = label.numpy().decode('utf-8')
spectrogram = get_spectrogram(waveform)
print('Label:', label)
print('Waveform shape:', waveform.shape)
print('Spectrogram shape:', spectrogram.shape)
print('Audio playback')
display.display(display.Audio(waveform, rate=16000))
Plot the waveform and spectrogram
def plot_spectrogram(spectrogram, ax):
    # Convert frequencies to log scale and transpose so that time is
    # represented on the x-axis (columns). Add a small epsilon to avoid log(0).
    log_spec = np.log(spectrogram.T + np.finfo(float).eps)
height = log_spec.shape[0]
X = np.arange(16000, step=height + 1)
Y = range(height)
ax.pcolormesh(X, Y, log_spec)
fig, axes = plt.subplots(2, figsize=(12, 8))
timescale = np.arange(waveform.shape[0])
axes[0].plot(timescale, waveform.numpy())
axes[0].set_title('Waveform')
axes[0].set_xlim([0, 16000])
plot_spectrogram(spectrogram.numpy(), axes[1])
axes[1].set_title('Spectrogram')
plt.show()
def get_spectrogram_and_label_id(audio, label):
spectrogram = get_spectrogram(audio)
spectrogram = tf.expand_dims(spectrogram, -1)
    # Boolean match against label_strings; argmax returns the index of the match
    label_id = tf.argmax(label == label_strings)
return spectrogram, label_id
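A minimal illustration of the boolean-match trick (assuming dataset_type is 'digits', so label_strings holds '0' through '9'):
# '3' matches label_strings only at index 3, so argmax of the boolean vector is 3.
print(tf.argmax(tf.constant('3') == label_strings).numpy())  # -> 3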
spectrogram_ds = waveform_ds.map(
get_spectrogram_and_label_id, num_parallel_calls=AUTOTUNE)
rows = 3
cols = 3
n = rows*cols
fig, axes = plt.subplots(rows, cols, figsize=(10, 10))
for i, (spectrogram, label_id) in enumerate(spectrogram_ds.take(n)):
r = i // cols
c = i % cols
ax = axes[r][c]
plot_spectrogram(np.squeeze(spectrogram.numpy()), ax)
ax.set_title(label_strings[label_id.numpy()])
ax.axis('off')
plt.show()
Extract MFCC features with python_speech_features.
from python_speech_features import mfcc
import scipy.io.wavfile as wav
mfccs = {}
for wave in filenames:
wave = wave.numpy().decode('utf-8')
(rate, sig) = wav.read(wave)
mfccs[wave] = mfcc(sig, rate, nfft=2000)
example_mfcc = list(mfccs.values())[0]
print('Total MFCCs:', len(mfccs))
print()
print('Example_MFCC')
print('Number of windows =', example_mfcc.shape[0])
print('Length of each feature =', example_mfcc.shape[1])
example_mfcc = example_mfcc.T
plt.matshow(example_mfcc)
plt.title('Example_MFCC')
plt.show()
We have finished preprocessing the training set. Now we repeat the same steps for the validation and test sets.
def preprocess_dataset(files):
files_ds = tf.data.Dataset.from_tensor_slices(files)
output_ds = files_ds.map(get_waveform_and_label, num_parallel_calls=AUTOTUNE)
output_ds = output_ds.map(
get_spectrogram_and_label_id, num_parallel_calls=AUTOTUNE)
return output_ds
train_ds = spectrogram_ds
val_ds = preprocess_dataset(val_files)
test_ds = preprocess_dataset(test_files)
Split data into batches
batch_size = 64
train_ds = train_ds.batch(batch_size)
val_ds = val_ds.batch(batch_size)
Cache and prefetch to reduce read latency during training.
train_ds = train_ds.cache().prefetch(AUTOTUNE)
val_ds = val_ds.cache().prefetch(AUTOTUNE)
Model
for spectrogram, _ in spectrogram_ds.take(1):
input_shape = spectrogram.shape
print('Input shape:', input_shape)
num_labels = len(label_strings)
norm_layer = preprocessing.Normalization()
norm_layer.adapt(spectrogram_ds.map(lambda x, _: x))
model = models.Sequential([
layers.Input(shape=input_shape),
preprocessing.Resizing(32, 32),
norm_layer,
layers.Conv2D(32, 3, activation='relu'),
layers.Conv2D(64, 3, activation='relu'),
layers.MaxPooling2D(),
layers.Dropout(0.25),
layers.Flatten(),
layers.Dense(128, activation='relu'),
layers.Dropout(0.5),
layers.Dense(num_labels),
])
model.summary()
model.compile(
optimizer=tf.keras.optimizers.Adam(),
loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
metrics=['accuracy'],
)
Train
EPOCHS = 25
history = model.fit(
train_ds,
validation_data=val_ds,
epochs=EPOCHS,
callbacks=tf.keras.callbacks.EarlyStopping(verbose=1, patience=2),
)
Save model
model_dir = 'models'
# Create a folder and save the model
model_name = 'model_single_digit' #Make sure you change this name. DO NOT OVERWRITE TRAINED MODELS.
# model.save(os.path.join(model_dir, model_name))
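To reload a saved model later, a minimal sketch (assuming the save line above was uncommented and run):
# reloaded_model = tf.keras.models.load_model(os.path.join(model_dir, model_name))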
Plot loss
metrics = history.history
plt.plot(history.epoch, metrics['loss'], metrics['val_loss'])
plt.legend(['loss', 'val_loss'])
plt.show()
Define a fully connected (FFN) baseline. The helper below is kept for reference but not used; the FFN that is actually trained is built right after.
def create_multi_classification_model(input_shape):
    model = tf.keras.Sequential()
    model.add(tf.keras.layers.Dense(256, input_shape=input_shape, activation='relu'))
    model.add(tf.keras.layers.Dense(128, activation='relu'))
    model.add(tf.keras.layers.Dense(64))
    model.add(tf.keras.layers.Dense(32))
    # Flatten before the head so the output is one score per label
    model.add(tf.keras.layers.Flatten())
    model.add(tf.keras.layers.Dense(num_labels, activation='softmax'))
    return model
FFN_model = models.Sequential([
layers.Input(shape=input_shape),
preprocessing.Resizing(64, 64),
norm_layer,
layers.Dense(64, activation='relu'),
layers.Dense(32, ),
layers.Dense(16, ),
layers.Dense(8, ),
layers.Dropout(0.25),
layers.Flatten(),
layers.Dense(num_labels, activation='softmax'),
])
# The final layer already applies softmax, so the loss must not expect logits.
FFN_model.compile(
    optimizer=tf.keras.optimizers.Adam(),
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=False),
    metrics=['accuracy'],
)
FFN_model.summary()
EPOCHS = 25
FFN_history = FFN_model.fit(
train_ds,
validation_data=val_ds,
epochs=EPOCHS,
callbacks=tf.keras.callbacks.EarlyStopping(verbose=1, patience=2),
)
metrics = FFN_history.history
plt.plot(FFN_history.epoch, metrics['loss'], metrics['val_loss'])
plt.legend(['loss', 'val_loss'])
plt.show()
Now repeat the whole pipeline for the letters dataset.
dataset_type = choices[1]
download_from = 'origin'
data_dir = pathlib.Path('data/avicar_some'+dataset_type)
download_data(data_dir, dataset_type, download_from)
if dataset_type == 'digits':
label_strings = np.array([str(num) for num in range(0,10)])
elif dataset_type == 'letters':
label_strings = np.array(list(string.ascii_lowercase))
else:
label_strings = np.array([str(num) for num in range(0,10)] + list(string.ascii_lowercase))
print(dataset_type + ":", label_strings)
filenames = tf.io.gfile.glob(str(data_dir)+"*/*.wav")
print("Example filename:", filenames[99])
print()
if dataset_type == "digits":
# Filter out non-digit files
filenames = [fname for fname in filenames if fname[32].isdigit()]
# Count # of examples for each label
for i in range(len(label_strings)):
num_examples = len([fname for fname in filenames if fname[32]==label_strings[i]])
print(f"""# examples for "{label_strings[i]}": {num_examples}""")
elif dataset_type == 'letters':
# Filter out non-letter files
filenames = [fname for fname in filenames if not(fname[32].isdigit())]
# Count # of examples for each label
for i in range(len(label_strings)):
num_examples = len([fname for fname in filenames if fname[33]==label_strings[i].upper()])
print(f"""# examples for "{label_strings[i]}": {num_examples}""")
num_samples = len(filenames)
print('# total examples:', num_samples)
print()
filenames = tf.random.shuffle(filenames)
print('Example file tensor:', filenames[0])
TRAIN_PORTION = 0.8
VAL_PORTION = 0.1
TEST_PORTION = 0.1
train_end = int(num_samples*TRAIN_PORTION)
val_end = train_end + int(num_samples*VAL_PORTION)
train_files = filenames[:train_end]
val_files = filenames[train_end: val_end]
test_files = filenames[val_end:]
print('Training set size:', len(train_files))
print('Validation set size:', len(val_files))
print('Test set size:', len(test_files))
files_ds = tf.data.Dataset.from_tensor_slices(train_files)
waveform_ds = files_ds.map(get_waveform_and_label, num_parallel_calls=AUTOTUNE)
Display a few waveforms from the letters training set.
rows = 3
cols = 3
n = rows*cols
fig, axes = plt.subplots(rows, cols, figsize=(10, 12))
for i, (audio, label) in enumerate(waveform_ds.take(n)):
r = i // cols
c = i % cols
ax = axes[r][c]
ax.plot(audio.numpy())
ax.set_yticks(np.arange(-1.2, 1.2, 0.2))
label = label.numpy().decode('utf-8')
ax.set_title(label)
plt.show()
for waveform, label in waveform_ds.take(1):
label = label.numpy().decode('utf-8')
spectrogram = get_spectrogram(waveform)
print('Label:', label)
print('Waveform shape:', waveform.shape)
print('Spectrogram shape:', spectrogram.shape)
print('Audio playback')
display.display(display.Audio(waveform, rate=16000))
fig, axes = plt.subplots(2, figsize=(12, 8))
timescale = np.arange(waveform.shape[0])
axes[0].plot(timescale, waveform.numpy())
axes[0].set_title('Waveform')
axes[0].set_xlim([0, 16000])
plot_spectrogram(spectrogram.numpy(), axes[1])
axes[1].set_title('Spectrogram')
plt.show()
spectrogram_ds = waveform_ds.map(
get_spectrogram_and_label_id, num_parallel_calls=AUTOTUNE)
rows = 3
cols = 3
n = rows*cols
fig, axes = plt.subplots(rows, cols, figsize=(10, 10))
for i, (spectrogram, label_id) in enumerate(spectrogram_ds.take(n)):
r = i // cols
c = i % cols
ax = axes[r][c]
plot_spectrogram(np.squeeze(spectrogram.numpy()), ax)
ax.set_title(label_strings[label_id.numpy()])
ax.axis('off')
plt.show()
train_ds = spectrogram_ds
val_ds = preprocess_dataset(val_files)
test_ds = preprocess_dataset(test_files)
batch_size = 64
train_ds = train_ds.batch(batch_size)
val_ds = val_ds.batch(batch_size)
train_ds = train_ds.cache().prefetch(AUTOTUNE)
val_ds = val_ds.cache().prefetch(AUTOTUNE)
for spectrogram, _ in spectrogram_ds.take(1):
input_shape = spectrogram.shape
print('Input shape:', input_shape)
num_labels = len(label_strings)
norm_layer = preprocessing.Normalization()
norm_layer.adapt(spectrogram_ds.map(lambda x, _: x))
letter_cnn_model = models.Sequential([
layers.Input(shape=input_shape),
preprocessing.Resizing(32, 32),
norm_layer,
layers.Conv2D(32, 3, activation='relu'),
layers.Conv2D(64, 3, activation='relu'),
layers.MaxPooling2D(),
layers.Dropout(0.25),
layers.Flatten(),
layers.Dense(128, activation='relu'),
layers.Dropout(0.5),
layers.Dense(num_labels),
])
letter_cnn_model.summary()
letter_cnn_model.compile(
optimizer=tf.keras.optimizers.Adam(),
loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
metrics=['accuracy'],
)
EPOCHS = 25
letter_cnn_history =letter_cnn_model.fit(
train_ds,
validation_data=val_ds,
epochs=EPOCHS,
callbacks=tf.keras.callbacks.EarlyStopping(verbose=1, patience=2),
)
Evaluate the trained letters model on the test set.
test_audio = []
test_labels = []
for audio, label in test_ds:
test_audio.append(audio.numpy())
test_labels.append(label.numpy())
test_audio = np.array(test_audio)
test_labels = np.array(test_labels)
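The stacked arrays should have shapes like (num_test_examples, 124, 129, 1) and (num_test_examples,); the exact count depends on the split:
print('Test audio shape:', test_audio.shape)
print('Test labels shape:', test_labels.shape)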
# Evaluate the letters CNN (not the digits FFN) on the held-out examples
y_pred = np.argmax(letter_cnn_model.predict(test_audio), axis=1)
y_true = test_labels
test_acc = sum(y_pred == y_true) / len(y_true)
print(f'Test set accuracy: {test_acc:.0%}')
Plot confusion matrix
import seaborn as sns
confusion_mtx = tf.math.confusion_matrix(y_true, y_pred)
plt.figure(figsize=(10, 8))
sns.heatmap(confusion_mtx, xticklabels=label_strings, yticklabels=label_strings,
annot=True, fmt='g')
plt.xlabel('Prediction')
plt.ylabel('Label')
plt.show()