А. VideoEncoder
import torch
import torch.nn as nn
import torch.nn.functional as F

from params import VideoHyperParams


def init_weights(m):
    if isinstance(m, nn.Conv2d):
        torch.nn.init.xavier_uniform_(m.weight)
        m.bias.data.fill_(0.01)


class VideoEncoder(nn.Module):
    def __init__(self):
        super(VideoEncoder, self).__init__()
        self.conv1 = nn.Conv2d(1, 1, kernel_size=3, padding=1)
        torch.nn.init.xavier_uniform_(self.conv1.weight)
        self.conv1.bias.data.fill_(0.01)
        self.conv2 = nn.Conv2d(1, 1, kernel_size=3, padding=1)
        torch.nn.init.xavier_uniform_(self.conv2.weight)
        self.conv2.bias.data.fill_(0.01)
        # Despite the name, bidirectional=True is not set, so this is a
        # unidirectional single-layer LSTM.
        self.BiLSTM = nn.LSTM(input_size=int(VideoHyperParams.EMBENDING_DIM),
                              hidden_size=int(VideoHyperParams.EMBENDING_DIM),
                              num_layers=1,
                              batch_first=True)
        self.BiLSTM_proj = nn.Linear(in_features=int(VideoHyperParams.EMBENDING_DIM),
                                     out_features=int(VideoHyperParams.EMBENDING_DIM))
        torch.nn.init.xavier_uniform_(self.BiLSTM_proj.weight)
        self.BiLSTM_proj.bias.data.fill_(0.01)

    def forward(self, images):
        # images: (batch, frames, channels=1, height, width)
        batch_size, numbers_of_frames, c, h, w = images.shape
        # The first frame initialises the LSTM state.
        x = images[:, 0]
        x = self.conv1(x)
        x = F.leaky_relu(self.conv2(x))
        x = torch.squeeze(x)
        out, (hn, cn) = self.BiLSTM(x)
        # Every following frame goes through the same convolutions and carries
        # the hidden state over from the previous frame.
        for i in range(1, numbers_of_frames):
            x = images[:, i]
            x = self.conv1(x)
            x = F.leaky_relu(self.conv2(x))
            x = torch.squeeze(x)
            out, (hn, cn) = self.BiLSTM(x, (hn, cn))
        res = self.BiLSTM_proj(out)
        return res
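All of the listings in this appendix import hyperparameters from a params module that is not reproduced here. Below is a minimal sketch of the assumed interface: the class and attribute names are taken from the code, while the numeric values are placeholders chosen only so the modules can be imported and smoke-tested; they are not stated in the source.

# params.py -- assumed interface; the values are placeholders, not the
# hyperparameters actually used in the work.
class VideoHyperParams:
    EMBENDING_DIM = 224        # frame width fed to the video LSTM (placeholder)
    NUMBER_OF_FRAMES = 30      # frames per video clip (placeholder)


class AudioHyperParams:
    EMBENDING_DIM = 256        # embedding size (placeholder)
    MEL_SAMPLES = 860          # time frames in a mel spectrogram (placeholder)
    NUMBER_OF_MEL_BANDS = 80   # consistent with the hard-coded 80 channels below


class TrainParams:
    EPOCHS = 100               # placeholder
    BATCH_SIZE = 4             # placeholder
    LEARNING_RATE = 2e-4       # placeholder
    BETA1 = 0.5                # placeholder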
В. AudioEncoder
import torch.nn as nn
from params import AudioHyperParams
class AudioEncoder(nn.Module):
    def __init__(self):
        super(AudioEncoder, self).__init__()
        # Despite the name, bidirectional=True is not set: this is a
        # unidirectional two-layer LSTM over the mel bands.
        self.BiLSTM = nn.LSTM(int(AudioHyperParams.MEL_SAMPLES),
                              int(AudioHyperParams.EMBENDING_DIM),
                              num_layers=2, batch_first=True)
        self.BiLSTM_proj = nn.Linear(int(AudioHyperParams.EMBENDING_DIM),
                                     int(AudioHyperParams.EMBENDING_DIM))

    def forward(self, spec):
        # spec: (batch, mel bands, MEL_SAMPLES)
        res, (hn, cn) = self.BiLSTM(spec)
        res = self.BiLSTM_proj(res)
        res = res.transpose(1, 2)
        return res
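A minimal usage sketch under the placeholder params above; the shapes are an inference from the listing, not stated in the source, and the module file name in the import is an assumption.

import torch

from audio_encoder import AudioEncoder   # file name assumed
from params import AudioHyperParams

encoder = AudioEncoder()
# Spectrogram laid out as (batch, mel bands, time frames).
spec = torch.randn(2,
                   int(AudioHyperParams.NUMBER_OF_MEL_BANDS),
                   int(AudioHyperParams.MEL_SAMPLES))
embedding = encoder(spec)
print(embedding.shape)  # -> (2, EMBENDING_DIM, NUMBER_OF_MEL_BANDS)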
Г. Generator
import torch
import torch.nn as nn

from params import AudioHyperParams


def init_weights(m):
    if isinstance(m, nn.Conv1d):
        torch.nn.init.xavier_uniform_(m.weight)
        m.bias.data.fill_(0.01)


class Generator(nn.Module):
    def __init__(self):
        super(Generator, self).__init__()
        self.conv1d_fisrt = nn.Sequential(
            nn.Conv1d(in_channels=int(AudioHyperParams.EMBENDING_DIM),
                      out_channels=125, kernel_size=3),
            nn.BatchNorm1d(125),
            nn.LeakyReLU(inplace=True),
            nn.Conv1d(in_channels=125, out_channels=80, kernel_size=3),
            nn.BatchNorm1d(80),
            nn.LeakyReLU(inplace=True),
        )
        self.conv1d_fisrt.apply(init_weights)
        self.conv1d_second = nn.Sequential(
            nn.Conv1d(in_channels=588,
                      out_channels=int(AudioHyperParams.MEL_SAMPLES),
                      kernel_size=3, padding=1),
            nn.LeakyReLU(inplace=True),
            nn.Conv1d(in_channels=int(AudioHyperParams.MEL_SAMPLES),
                      out_channels=int(AudioHyperParams.MEL_SAMPLES),
                      kernel_size=3, padding=1),
        )
        self.conv1d_second.apply(init_weights)

    def forward(self, x):
        x = self.conv1d_fisrt(x)
        x = x.transpose(1, 2)
        out = self.conv1d_second(x)
        out = out.transpose(1, 2)
        return out
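A note on the shapes this listing implies (an inference, not stated in the source): the two unpadded kernel-size-3 convolutions in conv1d_fisrt trim the time axis by 4, and after the transpose conv1d_second expects 588 input channels, so the generator assumes an input of shape (batch, AudioHyperParams.EMBENDING_DIM, 588 + 4 = 592) and produces (batch, 80, MEL_SAMPLES). A minimal smoke test under the placeholder params sketched earlier; the module file name in the import is an assumption.

import torch

from generator import Generator          # file name assumed
from params import AudioHyperParams

gen = Generator()
x = torch.randn(2, int(AudioHyperParams.EMBENDING_DIM), 592)
mel = gen(x)
print(mel.shape)  # -> (2, 80, MEL_SAMPLES): 80 mel bands by MEL_SAMPLES time frames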
Д. Discriminator
import torch
import torch.nn as nn
from params import AudioHyperParams, VideoHyperParams
def init_weights(m):
    if isinstance(m, nn.Conv1d):
        torch.nn.init.xavier_uniform_(m.weight)
        m.bias.data.fill_(0.01)
    if isinstance(m, nn.Conv2d):
        torch.nn.init.xavier_uniform_(m.weight)
        m.bias.data.fill_(0.01)


class Discriminator(nn.Module):
    def __init__(self):
        super(Discriminator, self).__init__()
        # 2D convolutions over the video frames (frames are treated as channels).
        self.frames_post_conv = nn.Sequential(
            nn.Conv2d(in_channels=int(VideoHyperParams.NUMBER_OF_FRAMES),
                      out_channels=int(VideoHyperParams.NUMBER_OF_FRAMES),
                      kernel_size=5, stride=2),
            nn.ReLU(True),
            nn.Conv2d(in_channels=int(VideoHyperParams.NUMBER_OF_FRAMES),
                      out_channels=int(AudioHyperParams.NUMBER_OF_MEL_BANDS),
                      kernel_size=6, stride=3),
            nn.ReLU(True),
            nn.Conv2d(in_channels=int(AudioHyperParams.NUMBER_OF_MEL_BANDS),
                      out_channels=int(AudioHyperParams.NUMBER_OF_MEL_BANDS),
                      kernel_size=7, stride=3),
        )
        self.frames_post_conv.apply(init_weights)
        # 1D convolution over the mel spectrogram.
        self.mel_conv = nn.Conv1d(in_channels=int(AudioHyperParams.NUMBER_OF_MEL_BANDS),
                                  out_channels=int(AudioHyperParams.NUMBER_OF_MEL_BANDS),
                                  kernel_size=3, stride=2, padding=1)
        torch.nn.init.xavier_uniform_(self.mel_conv.weight)
        self.mel_conv.bias.data.fill_(0.01)
        # Joint downsampling of the concatenated video and audio features.
        self.down_sampling = nn.Sequential(
            nn.Conv1d(80, 80, kernel_size=4, stride=2),
            nn.ReLU(True),
            nn.Conv1d(80, 120, kernel_size=6, stride=3),
            nn.ReLU(True),
            nn.Conv1d(120, 160, kernel_size=6, stride=3),
            nn.Conv1d(160, 200, kernel_size=6, stride=3),
            nn.Conv1d(200, 1, kernel_size=6, stride=3),
        )
        self.down_sampling.apply(init_weights)
        self.sigmoid = nn.Sigmoid()

    def forward(self, frames, mel_spec):
        mel = self.mel_conv(mel_spec)
        frames = torch.squeeze(frames)
        bs, _, _, _ = frames.shape
        frames = self.frames_post_conv(frames)
        frames = torch.reshape(frames, (bs, 80, -1))
        # Concatenate video and audio features along the time axis and map
        # them to a single realism score per sample.
        concat = torch.cat([frames, mel], dim=2)
        out = self.down_sampling(concat)
        out = torch.squeeze(out)
        out = self.sigmoid(out)
        return out
Е. Training the neural network component
import torch
import torch.optim as optim
import torch.nn as nn
import torchvision.transforms as transforms
from data import VideoDataset
from torch.utils.data import DataLoader
import os
from tqdm import tqdm
import warnings
import matplotlib.pyplot as plt
import librosa.display
warnings.simplefilter(action='ignore', category=UserWarning)
from discriminator import Discriminator
from main_generator import MainGenerator
from params import TrainParams

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
G = MainGenerator().to(device)
D = Discriminator().to(device)
G_optimizer = optim.Adam(G.parameters(),
                         lr=TrainParams.LEARNING_RATE,
                         betas=(TrainParams.BETA1, 0.999))
D_optimizer = optim.Adam(D.parameters(),
                         lr=TrainParams.LEARNING_RATE,
                         betas=(TrainParams.BETA1, 0.999))
loss_bce_gen = nn.BCELoss()
loss_bce_discr = nn.BCELoss()
loss_mce = nn.MSELoss()

test_transformer = transforms.Compose([
    transforms.ToTensor(),
])
test_ds = VideoDataset(root_dir='C:/Projects/soundGAN-main/features_data/hammer',
                       transform=test_transformer)
loader = DataLoader(test_ds, batch_size=int(TrainParams.BATCH_SIZE), shuffle=True)
print('Loader initialized')
d_scores = open('d_scores.txt', 'w')
g_scores = open('g_scores.txt', 'w')
os.makedirs('checkpoints', exist_ok=True)


def set_requires_grad(nets, requires_grad=False):
    if not isinstance(nets, list):
        nets = [nets]
    for net in nets:
        if net is not None:
            for param in net.parameters():
                param.requires_grad = requires_grad


os.makedirs('figures', exist_ok=True)

if __name__ == '__main__':
    print(f'Start training on {device}')
    for epoch in range(TrainParams.EPOCHS):
        progress_bar = tqdm(enumerate(loader), total=len(loader))
        for idx, batch in progress_bar:
            idx += 1
            real_mel_spec = batch[0].to(device)
            video_frames = batch[1].to(device)
            real_labels = torch.ones(TrainParams.BATCH_SIZE).to(device)
            fake_labels = torch.zeros(TrainParams.BATCH_SIZE).to(device)
            # Train Discriminator: freeze G and update D on real and generated spectrograms.
            set_requires_grad(G, False)
            set_requires_grad(D, True)
            with torch.no_grad():
                fake_mel_spec = G(video_frames, real_mel_spec)
            fake_outputs = D(video_frames, fake_mel_spec.detach())
            real_outputs = D(video_frames, real_mel_spec)
            real_loss = loss_bce_discr(real_outputs, real_labels)
            fake_loss = loss_bce_discr(fake_outputs, fake_labels)
            D_loss = (real_loss + fake_loss) * 0.5
            D_optimizer.zero_grad()
            D_loss.backward()
            D_optimizer.step()
            # Train Generator: freeze D so only G receives gradients.
            set_requires_grad(G, True)
            set_requires_grad(D, False)