

А. VideoEncoder



import torch
import torch.nn as nn
import torch.nn.functional as F

from params import VideoHyperParams


def init_weights(m):
    # Xavier-initialise Conv2d weights, set biases to a small constant
    if isinstance(m, nn.Conv2d):
        torch.nn.init.xavier_uniform_(m.weight)
        m.bias.data.fill_(0.01)
class VideoEncoder(nn.Module):

    def __init__(self):
        super(VideoEncoder, self).__init__()
        # Two single-channel convolutions applied to each frame
        self.conv1 = nn.Conv2d(1, 1, kernel_size=3, padding=1)
        torch.nn.init.xavier_uniform_(self.conv1.weight)
        self.conv1.bias.data.fill_(0.01)

        self.conv2 = nn.Conv2d(1, 1, kernel_size=3, padding=1)
        torch.nn.init.xavier_uniform_(self.conv2.weight)
        self.conv2.bias.data.fill_(0.01)

        # Recurrent part: frames are fed one by one, the hidden state
        # is carried over between frames
        self.BiLSTM = nn.LSTM(input_size=int(VideoHyperParams.EMBENDING_DIM),
                              hidden_size=int(VideoHyperParams.EMBENDING_DIM),
                              num_layers=1,
                              batch_first=True)

        self.BiLSTM_proj = nn.Linear(in_features=int(VideoHyperParams.EMBENDING_DIM),
                                     out_features=int(VideoHyperParams.EMBENDING_DIM))
        torch.nn.init.xavier_uniform_(self.BiLSTM_proj.weight)
        self.BiLSTM_proj.bias.data.fill_(0.01)

    def forward(self, images):
        batch_size, numbers_of_frames, c, h, w = images.shape

        # The first frame initialises the LSTM state
        x = images[:, 0]
        x = self.conv1(x)
        x = F.leaky_relu(self.conv2(x))
        x = torch.squeeze(x)
        out, (hn, cn) = self.BiLSTM(x)

        # Remaining frames are processed with the carried-over state
        for i in range(1, numbers_of_frames):
            x = images[:, i]
            x = self.conv1(x)
            x = F.leaky_relu(self.conv2(x))
            x = torch.squeeze(x)
            out, (hn, cn) = self.BiLSTM(x, (hn, cn))

        res = self.BiLSTM_proj(out)
        return res
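
A minimal smoke test for the listing above (illustrative only, not part of the thesis code): it assumes the VideoEncoder class is in scope and that VideoHyperParams.EMBENDING_DIM equals 64, so each frame is a single-channel 64x64 map and the squeezed convolution output matches the LSTM input size.

# Hypothetical usage sketch; 64 is an assumed value of EMBENDING_DIM
import torch

encoder = VideoEncoder()
frames = torch.randn(4, 16, 1, 64, 64)   # (batch, frames, channels, h, w)
embedding = encoder(frames)
print(embedding.shape)                    # torch.Size([4, 64, 64])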

В. AudioEncoder



import torch.nn as nn

from params import AudioHyperParams


class AudioEncoder(nn.Module):

    def __init__(self):
        super(AudioEncoder, self).__init__()
        # Two-layer LSTM over the mel spectrogram followed by a linear projection
        self.BiLSTM = nn.LSTM(int(AudioHyperParams.MEL_SAMPLES),
                              int(AudioHyperParams.EMBENDING_DIM),
                              num_layers=2,
                              batch_first=True)
        self.BiLSTM_proj = nn.Linear(int(AudioHyperParams.EMBENDING_DIM),
                                     int(AudioHyperParams.EMBENDING_DIM))

    def forward(self, spec):
        res, (hn, cn) = self.BiLSTM(spec)
        res = self.BiLSTM_proj(res)
        res = res.transpose(1, 2)
        return res
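
A minimal smoke test for AudioEncoder (illustrative only): it assumes the class above is in scope and uses hypothetical values MEL_SAMPLES = 128 and EMBENDING_DIM = 64, which are not taken from the thesis.

# Hypothetical usage sketch; 128 and 64 are assumed hyperparameter values
import torch

encoder = AudioEncoder()
spec = torch.randn(4, 80, 128)     # (batch, time frames, MEL_SAMPLES)
embedding = encoder(spec)
print(embedding.shape)             # torch.Size([4, 64, 80])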

Г. Generator




import torch
import torch.nn as nn

from params import AudioHyperParams


def init_weights(m):
    # Xavier-initialise Conv1d weights, set biases to a small constant
    if isinstance(m, nn.Conv1d):
        torch.nn.init.xavier_uniform_(m.weight)
        m.bias.data.fill_(0.01)


class Generator(nn.Module):

    def __init__(self):
        super(Generator, self).__init__()
        # First 1D-convolution stack: embedding channels -> 80 channels
        self.conv1d_fisrt = nn.Sequential(
            nn.Conv1d(in_channels=int(AudioHyperParams.EMBENDING_DIM),
                      out_channels=125, kernel_size=3),
            nn.BatchNorm1d(125),
            nn.LeakyReLU(True),
            nn.Conv1d(in_channels=125,
                      out_channels=80, kernel_size=3),
            nn.BatchNorm1d(80),
            nn.LeakyReLU(True)
        )
        self.conv1d_fisrt.apply(init_weights)

        # Second stack works along the transposed axis and produces the mel spectrogram
        self.conv1d_second = nn.Sequential(
            nn.Conv1d(in_channels=588,
                      out_channels=int(AudioHyperParams.MEL_SAMPLES),
                      kernel_size=3, padding=1),
            nn.LeakyReLU(True),
            nn.Conv1d(in_channels=int(AudioHyperParams.MEL_SAMPLES),
                      out_channels=int(AudioHyperParams.MEL_SAMPLES),
                      kernel_size=3, padding=1),
        )
        self.conv1d_second.apply(init_weights)

    def forward(self, x):
        x = self.conv1d_fisrt(x)
        x = x.transpose(1, 2)
        out = self.conv1d_second(x)
        out = out.transpose(1, 2)
        return out
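
A shape-only sketch of the data flow through Generator (illustrative, with assumed values EMBENDING_DIM = 64 and MEL_SAMPLES = 128). Note that the hard-coded in_channels=588 of the second stack requires the input sequence to be 592 steps long, since each unpadded kernel-3 convolution in the first stack shortens it by 2.

# Hypothetical shape check; 64, 128 are assumed values, 592 follows from in_channels=588
import torch

gen = Generator()
embedding = torch.randn(4, 64, 592)   # (batch, EMBENDING_DIM, time)
mel = gen(embedding)
# conv1d_fisrt: (4, 64, 592) -> (4, 80, 588); transpose -> (4, 588, 80)
# conv1d_second: (4, 588, 80) -> (4, 128, 80); transpose -> (4, 80, 128)
print(mel.shape)                      # torch.Size([4, 80, 128])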

Д. Discriminator



import torch
import torch.nn as nn

from params import AudioHyperParams, VideoHyperParams


def init_weights(m):
    # Xavier-initialise 1D and 2D convolution weights, set biases to a small constant
    if isinstance(m, nn.Conv1d):
        torch.nn.init.xavier_uniform_(m.weight)
        m.bias.data.fill_(0.01)
    if isinstance(m, nn.Conv2d):
        torch.nn.init.xavier_uniform_(m.weight)
        m.bias.data.fill_(0.01)


class Discriminator(nn.Module):

    def __init__(self):
        super(Discriminator, self).__init__()
        # Convolutional stack that compresses the stack of video frames
        self.frames_post_conv = nn.Sequential(
            nn.Conv2d(in_channels=int(VideoHyperParams.NUMBER_OF_FRAMES),
                      out_channels=int(VideoHyperParams.NUMBER_OF_FRAMES),
                      kernel_size=5, stride=2),
            nn.ReLU(True),
            nn.Conv2d(in_channels=int(VideoHyperParams.NUMBER_OF_FRAMES),
                      out_channels=int(AudioHyperParams.NUMBER_OF_MEL_BANDS),
                      kernel_size=6, stride=3),
            nn.ReLU(True),
            nn.Conv2d(in_channels=int(AudioHyperParams.NUMBER_OF_MEL_BANDS),
                      out_channels=int(AudioHyperParams.NUMBER_OF_MEL_BANDS),
                      kernel_size=7, stride=3),
        )
        self.frames_post_conv.apply(init_weights)

        # 1D convolution over the mel spectrogram
        self.mel_conv = nn.Conv1d(in_channels=int(AudioHyperParams.NUMBER_OF_MEL_BANDS),
                                  out_channels=int(AudioHyperParams.NUMBER_OF_MEL_BANDS),
                                  kernel_size=3, stride=2, padding=1)
        torch.nn.init.xavier_uniform_(self.mel_conv.weight)
        self.mel_conv.bias.data.fill_(0.01)

        # Down-sampling stack that maps the concatenated features to a single score
        self.down_sampling = nn.Sequential(
            nn.Conv1d(80, 80, kernel_size=4, stride=2),
            nn.ReLU(True),
            nn.Conv1d(80, 120, kernel_size=6, stride=3),
            nn.ReLU(True),
            nn.Conv1d(120, 160, kernel_size=6, stride=3),
            nn.Conv1d(160, 200, kernel_size=6, stride=3),
            nn.Conv1d(200, 1, kernel_size=6, stride=3),
        )
        self.down_sampling.apply(init_weights)
        self.sigmoid = nn.Sigmoid()

    def forward(self, frames, mel_spec):
        mel = self.mel_conv(mel_spec)

        frames = torch.squeeze(frames)
        bs, _, _, _ = frames.shape
        frames = self.frames_post_conv(frames)
        frames = torch.reshape(frames, (bs, 80, -1))

        # Concatenate video and audio features along the time axis
        concat = torch.cat([frames, mel], dim=2)
        out = self.down_sampling(concat)
        out = torch.squeeze(out)
        out = self.sigmoid(out)
        return out

Е. Training the neural network component



import os
import warnings

import torch
import torch.optim as optim
import torch.nn as nn
import torchvision.transforms as transforms
import matplotlib.pyplot as plt
import librosa.display
from torch.utils.data import DataLoader
from tqdm import tqdm

from data import VideoDataset

warnings.simplefilter(action='ignore', category=UserWarning)

from discriminator import Discriminator
from main_generator import MainGenerator
from params import TrainParams

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
G = MainGenerator().to(device)
D = Discriminator().to(device)

G_optimizer = optim.Adam(G.parameters(),
                         lr=TrainParams.LEARNING_RATE,
                         betas=(TrainParams.BETA1, 0.999))
D_optimizer = optim.Adam(D.parameters(),
                         lr=TrainParams.LEARNING_RATE,
                         betas=(TrainParams.BETA1, 0.999))

loss_bce_gen = nn.BCELoss()
loss_bce_discr = nn.BCELoss()
loss_mce = nn.MSELoss()

test_transformer = transforms.Compose([
    transforms.ToTensor(),
])

test_ds = VideoDataset(root_dir='C:/Projects/soundGAN-main/features_data/hammer',
                       transform=test_transformer)
loader = DataLoader(test_ds, batch_size=int(TrainParams.BATCH_SIZE), shuffle=True)
print('Loader initialized')

d_scores = open('d_scores.txt', 'w')
g_scores = open('g_scores.txt', 'w')
os.makedirs('checkpoints', exist_ok=True)


def set_requires_grad(nets, requires_grad=False):
    # Enable or disable gradients for a network or a list of networks
    if not isinstance(nets, list):
        nets = [nets]
    for net in nets:
        if net is not None:
            for param in net.parameters():
                param.requires_grad = requires_grad


os.makedirs('figures', exist_ok=True)

if __name__ == '__main__':
    print(f'Start training on {device}')
    for epoch in range(TrainParams.EPOCHS):
        progress_bar = tqdm(enumerate(loader), total=len(loader))
        for idx, batch in progress_bar:
            idx += 1
            real_mel_spec = batch[0].to(device)
            video_frames = batch[1].to(device)
            real_labels = torch.ones(TrainParams.BATCH_SIZE).to(device)
            fake_labels = torch.zeros(TrainParams.BATCH_SIZE).to(device)

            # Train Discriminator
            set_requires_grad(G, False)
            set_requires_grad(D, True)
            with torch.no_grad():
                fake_mel_spec = G(video_frames, real_mel_spec)
            fake_outputs = D(video_frames, fake_mel_spec.detach())
            real_outputs = D(video_frames, real_mel_spec)
            real_loss = loss_bce_discr(real_outputs, real_labels)
            fake_loss = loss_bce_discr(fake_outputs, fake_labels)
            D_loss = (real_loss + fake_loss) * 0.5
            D_optimizer.zero_grad()
            D_loss.backward()
            D_optimizer.step()

            # Train Generator
            set_requires_grad(G, True)
            set_requires_grad(D, False)