import glob
import os
import pathlib
import string

import numpy as np
import tensorflow as tf
# Use seed 66 for consistent experiments (seeds both the TF and NumPy RNGs,
# so tf.random.shuffle below is reproducible).
seed = 66
tf.random.set_seed(seed)
np.random.seed(seed)
# Which label subset to work with; index 0 selects 'digits'.
choices = ['digits', 'letters', 'mixed']
dataset_type = choices[0]

# Root directory of the chosen subset, e.g. data/avicar_somedigits.
data_dir = pathlib.Path('data/avicar_some'+dataset_type)

# Build the label vocabulary for the chosen subset.
# NOTE: the 'letters' and 'mixed' branches rely on the stdlib `string`
# module being imported at the top of the file.
if dataset_type == 'digits':
    label_strings = np.array([str(num) for num in range(0,10)])
elif dataset_type == 'letters':
    label_strings = np.array(list(string.ascii_uppercase))
else:
    label_strings = np.array([str(num) for num in range(0,10)] + list(string.ascii_uppercase))
    
print(dataset_type + ":", label_strings)

# Collect every .wav path under the dataset directory. The pattern is
# 'data/avicar_some<type>*/*.wav' (no separator before the first '*');
# it matches because '*' can also match the empty string.
filenames = tf.io.gfile.glob(str(data_dir)+"*/*.wav")
print("Example filename:", filenames[99])
print()

if dataset_type == "digits":
    # Filter out non-digit files
    # NOTE(review): fname[32] is a hard-coded character offset — it is the
    # label character only for paths shaped exactly like
    # 'data/avicar_somedigits/AM3_35U_D0_C1_M3.wav'. Verify if the directory
    # name or filename layout ever changes.
    filenames = [fname for fname in filenames if fname[32].isdigit()]
    
    # Count # of examples for each label
    for i in range(len(label_strings)):
        num_examples = len([fname for fname in filenames if fname[32]==label_strings[i]])
        print(f"""# examples for "{label_strings[i]}": {num_examples}""")
    
elif dataset_type == 'letters':
    # Filter out non-letter files
    filenames = [fname for fname in filenames if not(fname[32].isdigit())]

    # Count # of examples for each label
    # NOTE(review): this branch filters on fname[32] but compares fname[33] —
    # presumably letter files carry a two-character code; confirm against the
    # actual letter filenames, since the digits branch uses index 32 for both.
    for i in range(len(label_strings)):
        num_examples = len([fname for fname in filenames if fname[33]==label_strings[i].upper()])
        print(f"""# examples for "{label_strings[i]}": {num_examples}""")

num_samples = len(filenames)
print('# total examples:', num_samples)
print()

# Shuffle once (seeded at the top of the file) and carve a 70/30
# train/test split out of the shuffled order.
filenames = tf.random.shuffle(filenames)

TRAIN_PORTION = 0.7
TEST_PORTION = 0.3

# Index where the training slice ends and the test slice begins.
train_end = int(TRAIN_PORTION * num_samples)

train_files, test_files = filenames[:train_end], filenames[train_end:]

print('Training set size:', len(train_files))
print('Test set size:', len(test_files))
digits: ['0' '1' '2' '3' '4' '5' '6' '7' '8' '9']
Example filename: data/avicar_somedigits/AM3_35U_D0_C1_M3.wav

# examples for "0": 772
# examples for "1": 771
# examples for "2": 771
# examples for "3": 771
# examples for "4": 770
# examples for "5": 771
# examples for "6": 771
# examples for "7": 770
# examples for "8": 769
# examples for "9": 772
# total examples: 7708

Training set size: 5395
Test set size: 2313

Extract MFCC

from python_speech_features import mfcc
import scipy.io.wavfile as wav
def get_mfcc_dict(filenames):
    """Compute MFCC features for every wav file in *filenames*.

    Args:
        filenames: iterable of wav paths — TF string tensors (as produced by
            tf.random.shuffle above), raw bytes, or plain str all work.

    Returns:
        dict mapping each decoded path (str) to its MFCC matrix
        (one row per analysis frame).
    """
    mfccs = {}
    for wave in filenames:
        # Original code assumed TF tensors and crashed on plain strings;
        # unwrap tensor/bytes inputs so any path representation is accepted.
        if hasattr(wave, 'numpy'):
            wave = wave.numpy()
        if isinstance(wave, bytes):
            wave = wave.decode('utf-8')
        (rate, sig) = wav.read(wave)
        # nfft=2000 — presumably sized to cover the analysis window at this
        # corpus's sample rate; confirm against the wav files' actual rate.
        mfccs[wave] = mfcc(sig, rate, nfft=2000)
    return mfccs
# Precompute MFCC features for both splits (path -> MFCC matrix).
mfccs_train = get_mfcc_dict(train_files)
mfccs_test = get_mfcc_dict(test_files)

Prepare training and testing data

def get_label(file_path):
    """Return the one-character class label embedded in a wav filename.

    For paths like 'data/avicar_somedigits/AM3_35U_D0_C1_M3.wav' the label
    is character 9 of the basename ('0' in this example), matching the
    original tf.strings.substr(parts[-1], pos=9, len=1) behavior.

    Args:
        file_path: str, bytes, or TF string tensor path.

    Returns:
        The label as a Python str.
    """
    # Unwrap tensor/bytes inputs up front. The original built TF string ops
    # (split + substr) on every call — needless graph-op overhead and an
    # eager-mode requirement just to slice one character.
    if hasattr(file_path, 'numpy'):
        file_path = file_path.numpy()
    if isinstance(file_path, bytes):
        file_path = file_path.decode('utf-8')
    return os.path.basename(file_path)[9]

def preprocess_mfcc(mfccs):
    """Convert {path: mfcc_matrix} into (features, labels) arrays.

    Each file is summarized by the time-average of its MFCC frames; the
    label is parsed from the file path via get_label.
    """
    feature_rows = [np.mean(frames, axis=0) for frames in mfccs.values()]
    labels = [get_label(path) for path in mfccs]
    return np.array(feature_rows), np.array(labels)

# Build sklearn-ready arrays: one mean-MFCC row and one label per file.
training_data, training_label = preprocess_mfcc(mfccs_train)
testing_data, testing_label = preprocess_mfcc(mfccs_test)
from sklearn.model_selection import cross_val_score

SGDClassifier

from sklearn.linear_model import SGDClassifier

# Linear classifier trained with stochastic gradient descent.
model = SGDClassifier(max_iter=10000)
# NOTE(review): fitting before cross_val_score is redundant —
# cross_val_score clones and refits the estimator per fold.
model.fit(training_data, training_label)

# 10-fold cross-validated accuracy on the training set.
cross_val_score(model, training_data, training_label, cv=10, scoring='accuracy')
array([0.26296296, 0.30185185, 0.26481481, 0.26851852, 0.29074074,
       0.30055659, 0.283859  , 0.30983302, 0.2987013 , 0.27087199])

GaussianNB

from sklearn.naive_bayes import GaussianNB
# Gaussian naive Bayes baseline on the mean-MFCC features.
model = GaussianNB()
model.fit(training_data, training_label)

# 10-fold cross-validated accuracy on the training set.
cross_val_score(model, training_data, training_label, cv=10, scoring='accuracy')
array([0.32777778, 0.37592593, 0.36111111, 0.35      , 0.32962963,
       0.33766234, 0.33951763, 0.3135436 , 0.33766234, 0.33951763])

MLPClassifier

from sklearn.neural_network import MLPClassifier
# Feed-forward neural network (default architecture) on the mean-MFCC features.
model = MLPClassifier(max_iter=10000)
model.fit(training_data, training_label)

# 10-fold cross-validated accuracy on the training set.
cross_val_score(model, training_data, training_label, cv=10, scoring='accuracy')
array([0.46851852, 0.45740741, 0.45      , 0.48703704, 0.47962963,
       0.46382189, 0.48794063, 0.4471243 , 0.50463822, 0.48608534])

RandomForestClassifier

from sklearn.ensemble import RandomForestClassifier
# Random forest (default hyperparameters) on the mean-MFCC features.
model = RandomForestClassifier()
model.fit(training_data, training_label)

# 10-fold cross-validated accuracy on the training set.
cross_val_score(model, training_data, training_label, cv=10, scoring='accuracy')
array([0.43148148, 0.46666667, 0.44259259, 0.44074074, 0.42407407,
       0.37105751, 0.43784787, 0.45083488, 0.44341373, 0.43784787])

KNeighborsClassifier

from sklearn.neighbors import KNeighborsClassifier
# k-nearest-neighbors (default k=5) on the mean-MFCC features.
model = KNeighborsClassifier()
model.fit(training_data, training_label)

# 10-fold cross-validated accuracy on the training set.
cross_val_score(model, training_data, training_label, cv=10, scoring='accuracy')
array([0.33333333, 0.28148148, 0.29444444, 0.29814815, 0.29814815,
       0.27458256, 0.29684601, 0.30797774, 0.30612245, 0.28942486])