

А. VideoEncoder



import torch
import torch.nn as nn
import torch.nn.functional as F

from params import VideoHyperParams


def init_weights(m):
    # Xavier-initialise Conv2d weights, set biases to a small constant
    if isinstance(m, nn.Conv2d):
        torch.nn.init.xavier_uniform_(m.weight)
        m.bias.data.fill_(0.01)
class VideoEncoder(nn.Module):

    def __init__(self):
        super(VideoEncoder, self).__init__()
        # Two single-channel convolutions applied to each frame
        self.conv1 = nn.Conv2d(1, 1, kernel_size=3, padding=1)
        torch.nn.init.xavier_uniform_(self.conv1.weight)
        self.conv1.bias.data.fill_(0.01)

        self.conv2 = nn.Conv2d(1, 1, kernel_size=3, padding=1)
        torch.nn.init.xavier_uniform_(self.conv2.weight)
        self.conv2.bias.data.fill_(0.01)

        # Recurrent part: frames are fed one by one, the hidden state
        # is carried over between frames
        self.BiLSTM = nn.LSTM(input_size=int(VideoHyperParams.EMBENDING_DIM),
                              hidden_size=int(VideoHyperParams.EMBENDING_DIM),
                              num_layers=1,
                              batch_first=True)

        self.BiLSTM_proj = nn.Linear(in_features=int(VideoHyperParams.EMBENDING_DIM),
                                     out_features=int(VideoHyperParams.EMBENDING_DIM))
        torch.nn.init.xavier_uniform_(self.BiLSTM_proj.weight)
        self.BiLSTM_proj.bias.data.fill_(0.01)

    def forward(self, images):
        batch_size, numbers_of_frames, c, h, w = images.shape

        # The first frame initialises the LSTM state
        x = images[:, 0]
        x = self.conv1(x)
        x = F.leaky_relu(self.conv2(x))
        x = torch.squeeze(x)
        out, (hn, cn) = self.BiLSTM(x)

        # Remaining frames are processed with the carried-over state
        for i in range(1, numbers_of_frames):
            x = images[:, i]
            x = self.conv1(x)
            x = F.leaky_relu(self.conv2(x))
            x = torch.squeeze(x)
            out, (hn, cn) = self.BiLSTM(x, (hn, cn))

        res = self.BiLSTM_proj(out)
        return res
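
A minimal smoke test for the listing above (illustrative only, not part of the thesis code): it assumes the VideoEncoder class is in scope and that VideoHyperParams.EMBENDING_DIM equals 64, so each frame is a single-channel 64x64 map and the squeezed convolution output matches the LSTM input size.

# Hypothetical usage sketch; 64 is an assumed value of EMBENDING_DIM
import torch

encoder = VideoEncoder()
frames = torch.randn(4, 16, 1, 64, 64)   # (batch, frames, channels, h, w)
embedding = encoder(frames)
print(embedding.shape)                    # torch.Size([4, 64, 64])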

В. AudioEncoder



import torch.nn as nn

from params import AudioHyperParams


class AudioEncoder(nn.Module):

    def __init__(self):
        super(AudioEncoder, self).__init__()
        # Two-layer LSTM over the mel spectrogram followed by a linear projection
        self.BiLSTM = nn.LSTM(int(AudioHyperParams.MEL_SAMPLES),
                              int(AudioHyperParams.EMBENDING_DIM),
                              num_layers=2,
                              batch_first=True)
        self.BiLSTM_proj = nn.Linear(int(AudioHyperParams.EMBENDING_DIM),
                                     int(AudioHyperParams.EMBENDING_DIM))

    def forward(self, spec):
        res, (hn, cn) = self.BiLSTM(spec)
        res = self.BiLSTM_proj(res)
        res = res.transpose(1, 2)
        return res
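
A minimal smoke test for AudioEncoder (illustrative only): it assumes the class above is in scope and uses hypothetical values MEL_SAMPLES = 128 and EMBENDING_DIM = 64, which are not taken from the thesis.

# Hypothetical usage sketch; 128 and 64 are assumed hyperparameter values
import torch

encoder = AudioEncoder()
spec = torch.randn(4, 80, 128)     # (batch, time frames, MEL_SAMPLES)
embedding = encoder(spec)
print(embedding.shape)             # torch.Size([4, 64, 80])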

Г. Generator




import torch
import torch.nn as nn

from params import AudioHyperParams


def init_weights(m):
    # Xavier-initialise Conv1d weights, set biases to a small constant
    if isinstance(m, nn.Conv1d):
        torch.nn.init.xavier_uniform_(m.weight)
        m.bias.data.fill_(0.01)


class Generator(nn.Module):

    def __init__(self):
        super(Generator, self).__init__()
        # First 1D-convolution stack: embedding channels -> 80 channels
        self.conv1d_fisrt = nn.Sequential(
            nn.Conv1d(in_channels=int(AudioHyperParams.EMBENDING_DIM),
                      out_channels=125, kernel_size=3),
            nn.BatchNorm1d(125),
            nn.LeakyReLU(True),
            nn.Conv1d(in_channels=125,
                      out_channels=80, kernel_size=3),
            nn.BatchNorm1d(80),
            nn.LeakyReLU(True)
        )
        self.conv1d_fisrt.apply(init_weights)

        # Second stack works along the transposed axis and produces the mel spectrogram
        self.conv1d_second = nn.Sequential(
            nn.Conv1d(in_channels=588,
                      out_channels=int(AudioHyperParams.MEL_SAMPLES),
                      kernel_size=3, padding=1),
            nn.LeakyReLU(True),
            nn.Conv1d(in_channels=int(AudioHyperParams.MEL_SAMPLES),
                      out_channels=int(AudioHyperParams.MEL_SAMPLES),
                      kernel_size=3, padding=1),
        )
        self.conv1d_second.apply(init_weights)

    def forward(self, x):
        x = self.conv1d_fisrt(x)
        x = x.transpose(1, 2)
        out = self.conv1d_second(x)
        out = out.transpose(1, 2)
        return out
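
A shape-only sketch of the data flow through Generator (illustrative, with assumed values EMBENDING_DIM = 64 and MEL_SAMPLES = 128). Note that the hard-coded in_channels=588 of the second stack requires the input sequence to be 592 steps long, since each unpadded kernel-3 convolution in the first stack shortens it by 2.

# Hypothetical shape check; 64, 128 are assumed values, 592 follows from in_channels=588
import torch

gen = Generator()
embedding = torch.randn(4, 64, 592)   # (batch, EMBENDING_DIM, time)
mel = gen(embedding)
# conv1d_fisrt: (4, 64, 592) -> (4, 80, 588); transpose -> (4, 588, 80)
# conv1d_second: (4, 588, 80) -> (4, 128, 80); transpose -> (4, 80, 128)
print(mel.shape)                      # torch.Size([4, 80, 128])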

Д. Discriminator



import torch
import torch.nn as nn

from params import AudioHyperParams, VideoHyperParams


def init_weights(m):
    # Xavier-initialise 1D and 2D convolution weights, set biases to a small constant
    if isinstance(m, nn.Conv1d):
        torch.nn.init.xavier_uniform_(m.weight)
        m.bias.data.fill_(0.01)
    if isinstance(m, nn.Conv2d):
        torch.nn.init.xavier_uniform_(m.weight)
        m.bias.data.fill_(0.01)


class Discriminator(nn.Module):

    def __init__(self):
        super(Discriminator, self).__init__()
        # Convolutional stack that compresses the stack of video frames
        self.frames_post_conv = nn.Sequential(
            nn.Conv2d(in_channels=int(VideoHyperParams.NUMBER_OF_FRAMES),
                      out_channels=int(VideoHyperParams.NUMBER_OF_FRAMES),
                      kernel_size=5, stride=2),
            nn.ReLU(True),
            nn.Conv2d(in_channels=int(VideoHyperParams.NUMBER_OF_FRAMES),
                      out_channels=int(AudioHyperParams.NUMBER_OF_MEL_BANDS),
                      kernel_size=6, stride=3),
            nn.ReLU(True),
            nn.Conv2d(in_channels=int(AudioHyperParams.NUMBER_OF_MEL_BANDS),
                      out_channels=int(AudioHyperParams.NUMBER_OF_MEL_BANDS),
                      kernel_size=7, stride=3),
        )
        self.frames_post_conv.apply(init_weights)

        # 1D convolution over the mel spectrogram
        self.mel_conv = nn.Conv1d(in_channels=int(AudioHyperParams.NUMBER_OF_MEL_BANDS),
                                  out_channels=int(AudioHyperParams.NUMBER_OF_MEL_BANDS),
                                  kernel_size=3, stride=2, padding=1)
        torch.nn.init.xavier_uniform_(self.mel_conv.weight)
        self.mel_conv.bias.data.fill_(0.01)

        # Down-sampling stack that maps the concatenated features to a single score
        self.down_sampling = nn.Sequential(
            nn.Conv1d(80, 80, kernel_size=4, stride=2),
            nn.ReLU(True),
            nn.Conv1d(80, 120, kernel_size=6, stride=3),
            nn.ReLU(True),
            nn.Conv1d(120, 160, kernel_size=6, stride=3),
            nn.Conv1d(160, 200, kernel_size=6, stride=3),
            nn.Conv1d(200, 1, kernel_size=6, stride=3),
        )
        self.down_sampling.apply(init_weights)
        self.sigmoid = nn.Sigmoid()

    def forward(self, frames, mel_spec):
        mel = self.mel_conv(mel_spec)

        frames = torch.squeeze(frames)
        bs, _, _, _ = frames.shape
        frames = self.frames_post_conv(frames)
        frames = torch.reshape(frames, (bs, 80, -1))

        # Concatenate video and audio features along the time axis
        concat = torch.cat([frames, mel], dim=2)
        out = self.down_sampling(concat)
        out = torch.squeeze(out)
        out = self.sigmoid(out)
        return out

Е. Training the neural network component



import os
import warnings

import torch
import torch.optim as optim
import torch.nn as nn
import torchvision.transforms as transforms
import matplotlib.pyplot as plt
import librosa.display
from torch.utils.data import DataLoader
from tqdm import tqdm

from data import VideoDataset

warnings.simplefilter(action='ignore', category=UserWarning)

from discriminator import Discriminator
from main_generator import MainGenerator
from params import TrainParams

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
G = MainGenerator().to(device)
D = Discriminator().to(device)

G_optimizer = optim.Adam(G.parameters(),
                         lr=TrainParams.LEARNING_RATE,
                         betas=(TrainParams.BETA1, 0.999))
D_optimizer = optim.Adam(D.parameters(),
                         lr=TrainParams.LEARNING_RATE,
                         betas=(TrainParams.BETA1, 0.999))

loss_bce_gen = nn.BCELoss()
loss_bce_discr = nn.BCELoss()
loss_mce = nn.MSELoss()

test_transformer = transforms.Compose([
    transforms.ToTensor(),
])

test_ds = VideoDataset(root_dir='C:/Projects/soundGAN-main/features_data/hammer',
                       transform=test_transformer)
loader = DataLoader(test_ds, batch_size=int(TrainParams.BATCH_SIZE), shuffle=True)
print('Loader initialized')

d_scores = open('d_scores.txt', 'w')
g_scores = open('g_scores.txt', 'w')
os.makedirs('checkpoints', exist_ok=True)


def set_requires_grad(nets, requires_grad=False):
    # Enable or disable gradients for a network or a list of networks
    if not isinstance(nets, list):
        nets = [nets]
    for net in nets:
        if net is not None:
            for param in net.parameters():
                param.requires_grad = requires_grad


os.makedirs('figures', exist_ok=True)

if __name__ == '__main__':
    print(f'Start training on {device}')
    for epoch in range(TrainParams.EPOCHS):
        progress_bar = tqdm(enumerate(loader), total=len(loader))
        for idx, batch in progress_bar:
            idx += 1
            real_mel_spec = batch[0].to(device)
            video_frames = batch[1].to(device)
            real_labels = torch.ones(TrainParams.BATCH_SIZE).to(device)
            fake_labels = torch.zeros(TrainParams.BATCH_SIZE).to(device)

            # Train Discriminator
            set_requires_grad(G, False)
            set_requires_grad(D, True)
            with torch.no_grad():
                fake_mel_spec = G(video_frames, real_mel_spec)
            fake_outputs = D(video_frames, fake_mel_spec.detach())
            real_outputs = D(video_frames, real_mel_spec)
            real_loss = loss_bce_discr(real_outputs, real_labels)
            fake_loss = loss_bce_discr(fake_outputs, fake_labels)
            D_loss = (real_loss + fake_loss) * 0.5
            D_optimizer.zero_grad()
            D_loss.backward()
            D_optimizer.step()

            # Train Generator
            set_requires_grad(G, True)
            set_requires_grad(D, False)