Training WaveGAN for Shorter Sound Effects

First, let’s import the necessary libraries:

# Import necessary libraries
import os # For listing files in the dataset folder
import sys # For reading optional command-line flags in preprocess()
import librosa # For audio loading and feature extraction
import numpy as np # For numerical operations
import pandas as pd # For tabulating the file paths
from sklearn.model_selection import train_test_split # To split data into training and validation sets
from tensorflow.keras.models import Model # Functional-API model class
from tensorflow.keras.layers import Input, Conv2D, MaxPooling2D, UpSampling2D # Layers used to build the network
from tensorflow.keras.optimizers import Adam # Adam optimizer for model training
from tensorflow.keras.callbacks import EarlyStopping # Early stopping callback to prevent overfitting

Next, let’s load the dataset and preprocess it:

# Load the dataset
data_path = 'data/drum_sound_effects/' # Path to the dataset folder
files = os.listdir(data_path) # List all files in the dataset folder

# Collect the path of every .wav file whose name does not contain 'silence'
# (DataFrame.append was removed in pandas 2.0, so build a list of rows first)
wav_files = [{'file': data_path + file} for file in files
             if file.endswith('.wav') and 'silence' not in file]
df = pd.DataFrame(wav_files) # Dataframe with one row per audio file
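
As a quick sanity check (assuming the dataset folder above exists on your machine), it is worth confirming that files were actually found before preprocessing:

# Confirm the dataset was found; an empty dataframe here means the path is wrong
print(f'Found {len(df)} drum samples in {data_path}')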

# Preprocess the dataset
def preprocess(audio):
    # Load the audio with librosa, which returns the waveform first and the
    # sampling rate second, as a mono signal resampled to 16000 Hz
    x, sr = librosa.load(audio['file'], sr=16000, mono=True) # mono=True averages stereo channels, so no separate mono-conversion step is needed

    # Extract the first slice from each audio file (optional)
    if '--data_first_slice' in sys.argv: # Check if '--data_first_slice' was passed when running the script
        x = x[:16384] # Keep the first 16384 samples (about one second at 16 kHz)

    # Resample the audio to an 8 kHz sampling rate (optional)
    if '--data_sample_rate=8000' in sys.argv: # Check if '--data_sample_rate=8000' was passed when running the script
        x = librosa.resample(x, orig_sr=sr, target_sr=8000) # Actually resample the waveform
        sr = 8000

    # Convert to a log-mel spectrogram with 80 mel bands for the model input
    S = librosa.feature.melspectrogram(y=x, sr=sr, n_mels=80) # Power-scale mel spectrogram
    S = librosa.util.fix_length(S, size=256, axis=1) # Pad or truncate to a fixed 256 frames so every example has the same shape
    S = librosa.power_to_db(S, ref=np.max) # Convert to the log (decibel) scale
    S = (S - S.min()) / (S.max() - S.min() + 1e-8) # Normalize to [0, 1] to match the model's sigmoid output

    return {'input': np.expand_dims(S, axis=-1), 'target': x} # Spectrogram input plus the raw waveform for reference
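
To verify the shapes before building the model, you can run preprocess on a single file (assuming df is non-empty):

# Quick shape check on the first file: the input should be (80, 256, 1)
example = preprocess(df.iloc[0])
print('input shape:', example['input'].shape) # Expected: (80, 256, 1)
print('waveform samples:', example['target'].shape[0]) # Number of raw audio samples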

Now let’s define the model. Note that this is a simplified convolutional encoder-decoder standing in for the WaveGAN generator (the full WaveGAN is a GAN trained adversarially on raw waveforms); it operates on the fixed-size log-mel spectrograms produced above:

# Define the model architecture
def build_model():
    # Input for the log-mel spectrogram: (80 mel bands, 256 frames, 1 channel).
    # Input() creates a symbolic tensor for the functional API (InputLayer cannot be chained on directly)
    inputs = Input(shape=(80, 256, 1))

    # Convolutional layers to reduce dimensionality and extract features
    x = Conv2D(32, (4, 1), padding='same', activation='relu')(inputs) # 32 filters, 4x1 kernel, same padding to preserve shape
    x = MaxPooling2D((2, 1))(x) # Halve the mel-band axis: 80 -> 40
    x = Conv2D(64, (4, 1), padding='same', activation='relu')(x) # 64 filters, 4x1 kernel
    x = MaxPooling2D((2, 1))(x) # Halve the mel-band axis again: 40 -> 20

    # UpSampling layers to restore the original dimensionality
    x = UpSampling2D((2, 1))(x) # 20 -> 40
    x = Conv2D(64, (3, 1), padding='same', activation='relu')(x) # 64 filters, 3x1 kernel
    x = UpSampling2D((2, 1))(x) # 40 -> 80, restoring the input height

    # Output layer: one 1x1 filter with a sigmoid activation, producing an
    # output with the same (80, 256, 1) shape as the input spectrogram
    outputs = Conv2D(1, (1, 1), activation='sigmoid')(x)

    return Model(inputs=inputs, outputs=outputs) # Assemble the functional model
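
You can confirm that the input and output shapes line up by printing a model summary:

# Print the layer shapes; the final layer should report an output shape of
# (None, 80, 256, 1), matching the input spectrogram shape
build_model().summary()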

Finally, let’s train the model:

# Shuffle the dataset so the training/validation split is random
df = df.sample(frac=1).reset_index(drop=True)

# Preprocess every file and stack the spectrograms into a single numpy array
X = np.array([preprocess(row)['input'] for _, row in df.iterrows()])

# Split the data into training and validation sets using an 80/20 split
X_train, X_val = train_test_split(X, test_size=0.2, random_state=42)

# Create the model using the build_model function
model = build_model()

# Compile with binary cross-entropy loss and the Adam optimizer at a learning
# rate of 0.001 (the lr argument was renamed learning_rate in recent Keras)
model.compile(loss='binary_crossentropy', optimizer=Adam(learning_rate=1e-3))

# Train for up to 50 epochs with a batch size of 64, shuffling each epoch and
# reconstructing the input spectrograms (autoencoder-style targets), with an
# early stopping callback to prevent overfitting
early_stop = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
history = model.fit(X_train, X_train,
                    validation_data=(X_val, X_val),
                    epochs=50, batch_size=64, shuffle=True,
                    callbacks=[early_stop], verbose=1)
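
Once training finishes, you can listen to what the model has learned by inverting a reconstructed spectrogram back to audio. This is a minimal sketch: it assumes the soundfile package is installed, and it approximately undoes the [0, 1] normalization by mapping back to the default [-80, 0] dB range of power_to_db before running librosa's Griffin-Lim-based mel inversion:

import soundfile as sf # Assumed to be installed (pip install soundfile)

# Reconstruct one validation spectrogram with the trained model
S_pred = model.predict(X_val[:1])[0, :, :, 0] # Shape (80, 256)

# Approximately invert the preprocessing: [0, 1] -> [-80, 0] dB -> power scale
S_db = S_pred * 80.0 - 80.0
S_power = librosa.db_to_power(S_db)

# Invert the mel spectrogram to a waveform via Griffin-Lim phase estimation
y = librosa.feature.inverse.mel_to_audio(S_power, sr=16000)

# Save the result so you can listen to it
sf.write('reconstruction.wav', y, 16000)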

That’s it! You now have a model trained on the “Drum sound effects” dataset provided in the codebase. Keep in mind that the network above is a spectrogram autoencoder rather than a true WaveGAN: the actual WaveGAN (Donahue et al., 2018) pairs a generator with a discriminator trained adversarially, and synthesizes raw waveforms directly from a latent noise vector.
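
For reference, here is a minimal sketch of a WaveGAN-style generator in the same Keras style, following the architecture described in the WaveGAN paper (a 100-dimensional latent vector, length-25 transposed 1-D convolutions with stride 4, and a tanh output of 16384 samples); the exact filter counts here are illustrative, not the paper's verbatim configuration:

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Reshape, ReLU, Conv1DTranspose

def build_wavegan_generator(latent_dim=100):
    # Project the latent vector to a short, wide feature map, then repeatedly
    # upsample with transposed convolutions: 16 * 4^5 = 16384 output samples
    return Sequential([
        Dense(16 * 1024, input_shape=(latent_dim,)),
        Reshape((16, 1024)),
        ReLU(),
        Conv1DTranspose(512, 25, strides=4, padding='same'), ReLU(),
        Conv1DTranspose(256, 25, strides=4, padding='same'), ReLU(),
        Conv1DTranspose(128, 25, strides=4, padding='same'), ReLU(),
        Conv1DTranspose(64, 25, strides=4, padding='same'), ReLU(),
        Conv1DTranspose(1, 25, strides=4, padding='same', activation='tanh'),
    ])

# Sample a latent vector and generate about one second of 16 kHz audio
generator = build_wavegan_generator()
z = tf.random.normal((1, 100))
fake_audio = generator(z) # Shape (1, 16384, 1), values in [-1, 1]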
