Classify audio files according to their noise level and audio type.
def get_label(file_path):
    """Return the label character embedded in an audio file's name.

    The path is split on ``os.path.sep``; the single byte at offset 9 of the
    last component (the file name) is extracted and decoded as UTF-8.

    Args:
        file_path: Path to the audio file (presumably a plain string such as
            one returned by ``get_fnames`` -- confirm against callers).

    Returns:
        The extracted label as a Python ``str``.
    """
    # NOTE: .numpy() is called on the result, so this presumably has to run
    # eagerly rather than inside a traced graph.
    basename = tf.strings.split(file_path, os.path.sep)[-1]
    return tf.strings.substr(basename, pos=9, len=1).numpy().decode('utf-8')
def check_type(fname, dataset_type):
    """Tell whether a file's label belongs to the requested dataset type.

    Args:
        fname: Path of the audio file; its label is read with ``get_label``.
        dataset_type: ``'digits'`` to accept labels that are digits,
            ``'letters'`` to accept labels that are not digits.

    Returns:
        ``True`` if the label matches ``dataset_type``; ``False`` otherwise,
        including for any other ``dataset_type`` value.
    """
    label_is_digit = get_label(fname).isdigit()
    if dataset_type == 'digits':
        return label_is_digit
    if dataset_type == 'letters':
        return not label_is_digit
    return False
def get_fnames(data_dir):
    """List the ``.wav`` files found for a data directory.

    The glob suffix is appended directly to ``str(data_dir)`` with no
    separator in between, so the pattern matches ``.wav`` files sitting one
    level inside every directory whose path starts with ``data_dir``.

    Args:
        data_dir: Directory path (anything ``str()`` turns into a path).

    Returns:
        Whatever ``tf.io.gfile.glob`` returns for the pattern (a list of
        matching paths).
    """
    pattern = str(data_dir) + "*/*.wav"
    return tf.io.gfile.glob(pattern)
def filter_fnames(fnames):
    """Keep only the files whose label matches the current dataset type.

    The dataset type is not a parameter: it is read from the module-level
    ``dataset_type`` variable at call time, so that global must be set
    before calling.

    Args:
        fnames: Iterable of audio file paths.

    Returns:
        A new list with the paths for which ``check_type`` returns true.
    """
    return list(filter(lambda path: check_type(path, dataset_type), fnames))
def show_data_count(filenames, label_strings):
    """Print the number of files per label, followed by the overall total.

    Output format (one line per entry of ``label_strings``, in order)::

        # examples for "<label>": <count>
        # total examples: <len(filenames)>
        <blank line>

    Args:
        filenames: Sequence of audio file paths; each label is read with
            ``get_label``.
        label_strings: Labels to report on. Labels with no matching file are
            reported with a count of 0.
    """
    from collections import Counter

    # Extract each file's label exactly once. get_label is comparatively
    # expensive, and re-running it for every (label, file) pair multiplies
    # the work by the number of labels.
    counts = Counter(get_label(fname) for fname in filenames)
    for label in label_strings:
        # Counter returns 0 for labels that never occurred.
        print(f"""# examples for "{label}": {counts[label]}""")
    print('# total examples:', len(filenames))
    print()
# Three-character noise-level codes (the same set that
# classify_noise_levels groups files by).
noise_levels = [
    'IDL',
    '35D',
    '35U',
    '55D',
    '55U',
]
def get_noise_level(file_path):
    """Return the noise-level code embedded in an audio file's name.

    The path is split on ``os.path.sep``; the three bytes starting at offset
    4 of the last component (the file name) are extracted and decoded as
    UTF-8.

    Args:
        file_path: Path to the audio file (presumably a plain string such as
            one returned by ``get_fnames`` -- confirm against callers).

    Returns:
        The extracted code as a Python ``str``.
    """
    # NOTE: .numpy() is called on the result, so this presumably has to run
    # eagerly rather than inside a traced graph.
    basename = tf.strings.split(file_path, os.path.sep)[-1]
    return tf.strings.substr(basename, pos=4, len=3).numpy().decode('utf-8')
def classify_noise_levels(fnames):
    """Group file paths by the noise-level code found in their names.

    Args:
        fnames: Iterable of audio file paths; each code is read with
            ``get_noise_level``.

    Returns:
        A dict with one key per known code (``'IDL'``, ``'35D'``, ``'35U'``,
        ``'55D'``, ``'55U'``), each mapping to the list of paths carrying
        that code, in input order. Codes with no files map to an empty list.

    Raises:
        KeyError: If a file's code is not one of the known codes.
    """
    known_levels = ('IDL', '35D', '35U', '55D', '55U')
    # Start every known level with its own empty bucket.
    buckets = {code: [] for code in known_levels}
    for path in fnames:
        buckets[get_noise_level(path)].append(path)
    return buckets
def load_fnames(fname_path):
    """Load and return the object pickled in the file at ``fname_path``.

    Args:
        fname_path: Path of a pickle file, opened in binary mode.

    Returns:
        The unpickled object.
    """
    # NOTE: unpickling can execute arbitrary code; only load trusted files.
    with open(fname_path, mode='rb') as stream:
        contents = pickle.load(stream)
    return contents
# --- Digits: group the digit recordings by noise level, pickle each group ---
dataset_type = 'digits'  # also read as a global by filter_fnames()
data_dir = pathlib.Path('data/avicar_some'+dataset_type)
label_strings = np.array([str(num) for num in range(0,10)])
filenames = get_fnames(data_dir)
# Fail with a clear message instead of a bare IndexError on the sample below.
if not filenames:
    raise FileNotFoundError(f"No .wav files found for {data_dir}")
# Show one sample path: index 99 when available, otherwise the last match.
print("Example filename:", filenames[min(99, len(filenames) - 1)])
print()
# Filter out non-digit files
filenames = filter_fnames(filenames)
# Count # of examples for each label
show_data_count(filenames, label_strings)
noise_level_dict = classify_noise_levels(filenames)
noise_level_dir = 'noise_levels'
digit_dir = 'digit_noise_levels'
csv_path = os.path.join(noise_level_dir, digit_dir)
# exist_ok=True replaces the separate exists() check, which left a window
# between the check and the creation.
os.makedirs(csv_path, exist_ok=True)
# Write one pickle file per noise level: <csv_path>/<LEVEL>.data
for level, level_fnames in noise_level_dict.items():
    name = level + '.data'
    file_path = os.path.join(csv_path, name)
    with open(file_path, 'wb') as filehandle:
        pickle.dump(level_fnames, filehandle)
# --- Letters: group the letter recordings by noise level, pickle each group ---
dataset_type = 'letters'  # also read as a global by filter_fnames()
data_dir = pathlib.Path('data/avicar_some'+dataset_type)
label_strings = np.array(list(string.ascii_uppercase))
filenames = get_fnames(data_dir)
# Fail with a clear message instead of a bare IndexError on the sample below.
if not filenames:
    raise FileNotFoundError(f"No .wav files found for {data_dir}")
# Show one sample path: index 99 when available, otherwise the last match.
print("Example filename:", filenames[min(99, len(filenames) - 1)])
print()
# Filter out non-letter files (files whose label is a digit)
filenames = filter_fnames(filenames)
# Count # of examples for each label
show_data_count(filenames, label_strings)
noise_level_dict = classify_noise_levels(filenames)
noise_level_dir = 'noise_levels'
digit_dir = 'letter_noise_levels'  # name kept from the digits section
csv_path = os.path.join(noise_level_dir, digit_dir)
# exist_ok=True replaces the separate exists() check, which left a window
# between the check and the creation.
os.makedirs(csv_path, exist_ok=True)
# Write one pickle file per noise level: <csv_path>/<LEVEL>.data
for level, level_fnames in noise_level_dict.items():
    name = level + '.data'
    file_path = os.path.join(csv_path, name)
    with open(file_path, 'wb') as filehandle:
        pickle.dump(level_fnames, filehandle)
# --- Mixed: group digit + letter recordings by noise level, pickle each group ---
dataset_type = 'mixed'
data_dir = pathlib.Path('data/avicar_'+dataset_type)
label_strings = np.array([str(num) for num in range(0,10)] + list(string.ascii_uppercase))
filenames = get_fnames(data_dir)
# Fail with a clear message instead of a bare IndexError on the sample below.
if not filenames:
    raise FileNotFoundError(f"No .wav files found for {data_dir}")
# Show one sample path: index 99 when available, otherwise the last match.
print("Example filename:", filenames[min(99, len(filenames) - 1)])
print()
# Filter out DD, DT, DZ
# The two-character code is read at offsets 8-9 of the file name itself, so
# the filter does not depend on the length of the directory prefix (for
# paths under 'data/avicar_mixed/' this is the same as absolute offsets
# 26-27).
excluded_codes = {'DD', 'DT', 'DZ'}
filenames = [fname for fname in filenames
             if os.path.basename(fname)[8:10] not in excluded_codes]
# Count # of examples for each label
show_data_count(filenames, label_strings)
noise_level_dict = classify_noise_levels(filenames)
noise_level_dir = 'noise_levels'
digit_dir = 'mixed_noise_levels'  # name kept from the digits section
csv_path = os.path.join(noise_level_dir, digit_dir)
# exist_ok=True replaces the separate exists() check, which left a window
# between the check and the creation.
os.makedirs(csv_path, exist_ok=True)
# Write one pickle file per noise level: <csv_path>/<LEVEL>.data
for level, level_fnames in noise_level_dict.items():
    name = level + '.data'
    file_path = os.path.join(csv_path, name)
    with open(file_path, 'wb') as filehandle:
        pickle.dump(level_fnames, filehandle)
# --- Augmented digits: group recordings by noise level, pickle each group ---
dataset_type = 'digits'  # also read as a global by filter_fnames()
data_dir = pathlib.Path('data/augmented')
label_strings = np.array([str(num) for num in range(0,10)])
filenames = get_fnames(data_dir)
# Fail with a clear message instead of a bare IndexError on the sample below.
if not filenames:
    raise FileNotFoundError(f"No .wav files found for {data_dir}")
# Show one sample path: index 99 when available, otherwise the last match.
print("Example filename:", filenames[min(99, len(filenames) - 1)])
print()
# Filter out non-digit files
filenames = filter_fnames(filenames)
# # Count # of examples for each label
# show_data_count(filenames, label_strings)
noise_level_dict = classify_noise_levels(filenames)
noise_level_dir = 'noise_levels'
digit_dir = 'digit_augmented_noise_levels'
csv_path = os.path.join(noise_level_dir, digit_dir)
# exist_ok=True replaces the separate exists() check, which left a window
# between the check and the creation.
os.makedirs(csv_path, exist_ok=True)
# Write one pickle file per noise level: <csv_path>/<LEVEL>.data
for level, level_fnames in noise_level_dict.items():
    name = level + '.data'
    file_path = os.path.join(csv_path, name)
    with open(file_path, 'wb') as filehandle:
        pickle.dump(level_fnames, filehandle)