Implement ray for hyperparameter tuning
parent b910086893
commit 95c2b25268
5 changed files with 235 additions and 115 deletions
NeuralNetwork.py (new file, 26 lines)
@@ -0,0 +1,26 @@
from torch import nn


class NeuralNetwork(nn.Module):
    def __init__(self, l1=120, l2=84):
        super(NeuralNetwork, self).__init__()
        self.conv_relu_stack = nn.Sequential(
            nn.Conv2d(3, 6, (5, 5)),
            nn.ReLU(),
            nn.MaxPool2d(2, 2),
            nn.Conv2d(6, 16, (5, 5)),
            nn.ReLU(),
            nn.MaxPool2d(2, 2)
        )
        self.linear_relu_stack = nn.Sequential(
            nn.Linear(16*(5**2), l1),
            nn.ReLU(),
            nn.Linear(l1, l2),
            nn.ReLU(),
            nn.Linear(l2, 10),
        )

    def forward(self, x):
        x = self.conv_relu_stack(x)
        x = x.view(-1, 16 * (5 ** 2))
        return self.linear_relu_stack(x)
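Note: the linear stack assumes the convolutional stack flattens to 16 * 5 * 5 = 400 features, which only holds for CIFAR-10-sized 32×32 inputs (32 → 28 → 14 → 10 → 5 through the two conv/pool pairs). A minimal shape sanity check, not part of the commit:

import torch

from NeuralNetwork import NeuralNetwork

dummy = torch.randn(4, 3, 32, 32)     # a fake batch of four CIFAR-10 images
model = NeuralNetwork(l1=120, l2=84)
print(model(dummy).shape)             # torch.Size([4, 10]), one logit per class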
dataset.py (new file, 44 lines)
@@ -0,0 +1,44 @@
from torch.utils.data import random_split, DataLoader
from torchvision import datasets
from torchvision.transforms import ToTensor


def get_data(data_root, download=False):
    transform = ToTensor()
    # Download training data from open datasets.
    training_data = datasets.CIFAR10(
        root=data_root,
        train=True,
        download=download,
        transform=transform,
    )

    # Download test data from open datasets.
    testing_data = datasets.CIFAR10(
        root=data_root,
        train=False,
        download=download,
        transform=transform,
    )

    return training_data, testing_data


def load_data(config, data_root):
    train_set, test_set = get_data(data_root)

    test_abs = int(len(train_set) * 0.8)
    train_subset, test_subset = random_split(
        train_set, [test_abs, len(train_set) - test_abs])

    train_loader = DataLoader(
        train_subset,
        batch_size=int(config["batch_size"]),
        shuffle=True,
        num_workers=2)
    test_loader = DataLoader(
        test_subset,
        batch_size=int(config["batch_size"]),
        shuffle=True,
        num_workers=2)
    return train_loader, test_loader
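Note: load_data reads only batch_size from the Ray Tune config and splits the CIFAR-10 training set 80/20 into train/validation loaders. An illustrative call, not part of the commit (the path is a placeholder):

from dataset import get_data, load_data

data_root = "/tmp/cifar10"              # placeholder path
get_data(data_root, download=True)      # fetch CIFAR-10 once up front
train_loader, val_loader = load_data({"batch_size": 8}, data_root)
images, labels = next(iter(train_loader))
print(images.shape, labels.shape)       # torch.Size([8, 3, 32, 32]) torch.Size([8])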
main.py (175 changed lines)
@@ -1,126 +1,71 @@
-from os.path import isfile
+from functools import partial
+from os.path import join

-import torch
-from torch import nn
-from torch.utils.data import DataLoader
-from torchvision import datasets
-from torchvision.transforms import ToTensor
+from numpy.random import randint
+from ray import tune
+from ray.tune import CLIReporter
+from ray.tune.schedulers import ASHAScheduler
+from torch import nn, load, save
+from torch.cuda import is_available
+
+from NeuralNetwork import NeuralNetwork
+from dataset import get_data
+from tests import test_accuracy
+from training import training

-device = "cuda" if torch.cuda.is_available() else "cpu"
+device = "cuda:0" if is_available() else "cpu"
 print(f"Using {device} device")


-def get_data(batch_size: int = 64):
-    # Download training data from open datasets.
-    training_data = datasets.CIFAR10(
-        root="/home/flifloo/IA/data",
-        train=True,
-        download=True,
-        transform=ToTensor(),
-    )
+def main(data_root, num_samples=10, max_num_epochs=10, gpus_per_trial=1):
+    get_data(data_root, True)

-    # Download test data from open datasets.
-    testing_data = datasets.CIFAR10(
-        root="/home/flifloo/IA/data",
-        train=False,
-        download=True,
-        transform=ToTensor(),
-    )
+    config = {
+        "l1": tune.sample_from(lambda _: 2 ** randint(2, 9)),
+        "l2": tune.sample_from(lambda _: 2 ** randint(2, 9)),
+        "lr": tune.loguniform(1e-4, 1e-1),
+        "batch_size": tune.choice([2, 4, 8, 16])
+    }
+    scheduler = ASHAScheduler(
+        metric="loss",
+        mode="min",
+        max_t=max_num_epochs,
+        grace_period=1,
+        reduction_factor=2)
+    reporter = CLIReporter(
+        # parameter_columns=["l1", "l2", "lr", "batch_size"],
+        metric_columns=["loss", "accuracy", "training_iteration"])
+    result = tune.run(
+        partial(training, data_root=data_root, device=device),
+        resources_per_trial={"cpu": 2, "gpu": gpus_per_trial},
+        config=config,
+        num_samples=num_samples,
+        scheduler=scheduler,
+        progress_reporter=reporter)

-    # Create data loaders.
-    train_dataloader = DataLoader(training_data, batch_size=batch_size, shuffle=True)
-    test_dataloader = DataLoader(testing_data, batch_size=batch_size, shuffle=True)
+    best_trial = result.get_best_trial("loss", "min", "last")
+    print(f"Best trial config: {best_trial.config}")
+    print(f"Best trial final validation loss: {best_trial.last_result['loss']}")
+    print(f"Best trial final validation accuracy: {best_trial.last_result['accuracy']}")

-    return train_dataloader, test_dataloader
+    best_trained_model = NeuralNetwork(best_trial.config["l1"], best_trial.config["l2"])
+    if is_available():
+        if gpus_per_trial > 1:
+            best_trained_model = nn.DataParallel(best_trained_model)
+    best_trained_model.to(device)
+
+    best_checkpoint_dir = best_trial.checkpoint.value
+    model_state, optimizer_state = load(join(
+        best_checkpoint_dir, "checkpoint"))
+    best_trained_model.load_state_dict(model_state)
+
+    # In case PyTorch does not save the model at the end
+    print("In case saving...")
+    save(best_trained_model, "/home/flifloo/IA/model.pth")
+
+    print("Testing accuracy...")
+    print(f"Best trial test set accuracy: {test_accuracy(best_trained_model, data_root, device)}")


-# Define model
-class NeuralNetwork(nn.Module):
-    def __init__(self):
-        super(NeuralNetwork, self).__init__()
-        self.conv_relu_stack = nn.Sequential(
-            nn.Conv2d(3, 6, (5, 5)),
-            nn.MaxPool2d(2, 2),
-            nn.ReLU(),
-            nn.Conv2d(6, 16, (5, 5)),
-            nn.MaxPool2d(2, 2),
-            nn.ReLU(),
-        )
-        self.linear_relu_stack = nn.Sequential(
-            nn.Linear(16*(5**2), 120),
-            nn.ReLU(),
-            nn.Linear(120, 84),
-            nn.ReLU(),
-            nn.Linear(84, 10),
-            nn.ReLU(),
-        )
-
-    def forward(self, x):
-        x = self.conv_relu_stack(x)
-        x = x.view(-1, 16 * 5 * 5)
-        return self.linear_relu_stack(x)
-
-
-def train(dataloader, model, loss_fn, optimizer):
-    size = len(dataloader.dataset)
-    for batch, (X, y) in enumerate(dataloader):
-        X, y = X.to(device), y.to(device)
-
-        # Compute prediction error
-        pred = model(X)
-        loss = loss_fn(pred, y)
-
-        # Backpropagation
-        optimizer.zero_grad()
-        loss.backward()
-        optimizer.step()
-
-        if batch % 100 == 0:
-            loss, current = loss.item(), batch * len(X)
-            print(f"loss: {loss:>7f} [{current:>5d}/{size:>5d}]")
-
-
-def test(dataloader, model, loss_fn):
-    size = len(dataloader.dataset)
-    model.eval()
-    test_loss, correct = 0, 0
-    with torch.no_grad():
-        for X, y in dataloader:
-            X, y = X.to(device), y.to(device)
-            pred = model(X)
-            test_loss += loss_fn(pred, y).item()
-            correct += (pred.argmax(1) == y).type(torch.float).sum().item()
-    test_loss /= size
-    correct /= size
-    print(f"Test Error: \n Accuracy: {(100*correct):>0.1f}%, Avg loss: {test_loss:>8f} \n")
-    return correct
-
-
-def training():
-    train_data, test_data = get_data()
-
-    model = NeuralNetwork().to(device)
-    if isfile("model.pth"):
-        print("Loading model from save")
-        model.load_state_dict(torch.load("model.pth"))
-
-    print(model)
-
-    loss_fn = nn.CrossEntropyLoss()
-    # lr = over-/under-fitting trade-off
-    optimizer = torch.optim.SGD(model.parameters(), lr=1e-3, momentum=.9)
-
-    e = 0
-    c = 0
-    while c < 0.90:
-        print(f"Epoch {e+1}\n-------------------------------")
-        train(train_data, model, loss_fn, optimizer)
-        c = test(test_data, model, loss_fn)
-        torch.save(model.state_dict(), "model.pth")
-        e += 1
-    print("Done!")
-
-
-if __name__ == '__main__':
-    training()
+if __name__ == "__main__":
+    main("/home/flifloo/IA/data")
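Note: each Ray Tune trial receives one concrete draw from config as the first argument of training, with data_root and device bound via functools.partial. The l1/l2 entries use tune.sample_from, so every trial gets an independent power of two between 4 and 256; lr is sampled log-uniformly and batch_size from the fixed choices. A small illustration of the layer-width sampling, not part of the commit:

from numpy.random import randint

# randint(2, 9) draws an integer in [2, 8], so 2 ** randint(2, 9) lies in {4, ..., 256}.
print(sorted({2 ** randint(2, 9) for _ in range(1000)}))
# -> [4, 8, 16, 32, 64, 128, 256]

# One possible config handed to training(), with illustrative values:
example_config = {"l1": 128, "l2": 64, "lr": 3e-3, "batch_size": 8}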
tests.py (new file, 45 lines)
@@ -0,0 +1,45 @@
from torch import no_grad, max
from torch.utils.data import DataLoader

from dataset import get_data


def test(test_loader, net, criterion, device):
    val_loss = 0.0
    val_steps = 0
    total = 0
    correct = 0
    for i, data in enumerate(test_loader, 0):
        with no_grad():
            inputs, labels = data
            inputs, labels = inputs.to(device), labels.to(device)

            outputs = net(inputs)
            _, predicted = max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

            loss = criterion(outputs, labels)
            val_loss += loss.cpu().numpy()
            val_steps += 1
    return val_loss / val_steps, correct / total


def test_accuracy(net, data_root, device):
    train_set, test_set = get_data(data_root)

    test_loader = DataLoader(
        test_set, batch_size=4, shuffle=False, num_workers=2)

    correct = 0
    total = 0
    with no_grad():
        for data in test_loader:
            images, labels = data
            images, labels = images.to(device), labels.to(device)
            outputs = net(images)
            _, predicted = max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    return correct / total
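Note: test evaluates the validation loader built inside training(), while test_accuracy rebuilds a loader over the held-out CIFAR-10 test split. An illustrative call, not part of the commit, assuming the dataset was already downloaded to data_root (an untrained network should land near 10% accuracy on 10 classes):

from NeuralNetwork import NeuralNetwork
from tests import test_accuracy

device = "cpu"
net = NeuralNetwork(l1=120, l2=84).to(device)
print(test_accuracy(net, "/tmp/cifar10", device))   # placeholder path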
training.py (new file, 60 lines)
@@ -0,0 +1,60 @@
from os.path import join

from ray import tune
from torch import save, load, nn
from torch.optim import SGD

from NeuralNetwork import NeuralNetwork
from dataset import load_data
from tests import test


def train(train_loader, net, optimizer, criterion, epoch, device):
    running_loss = 0.0
    epoch_steps = 0
    for i, data in enumerate(train_loader, 0):
        # get the inputs; data is a list of [inputs, labels]
        inputs, labels = data
        inputs, labels = inputs.to(device), labels.to(device)

        # zero the parameter gradients
        optimizer.zero_grad()

        # forward + backward + optimize
        outputs = net(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        # print statistics
        running_loss += loss.item()
        epoch_steps += 1
        if i % 2000 == 1999:  # print every 2000 mini-batches
            print("[%d, %5d] loss: %.3f" % (epoch + 1, i + 1,
                                            running_loss / epoch_steps))
            running_loss = 0.0


def training(config, data_root, device="cpu", checkpoint_dir=None):
    net = NeuralNetwork(config["l1"], config["l2"]).to(device)

    criterion = nn.CrossEntropyLoss()
    optimizer = SGD(net.parameters(), lr=config["lr"], momentum=0.9)

    if checkpoint_dir:
        model_state, optimizer_state = load(
            join(checkpoint_dir, "checkpoint"))
        net.load_state_dict(model_state)
        optimizer.load_state_dict(optimizer_state)

    train_loader, test_loader = load_data(config, data_root)

    for epoch in range(10):
        train(train_loader, net, optimizer, criterion, epoch, device)
        loss, accuracy = test(test_loader, net, criterion, device)

        with tune.checkpoint_dir(epoch) as checkpoint_dir:
            path = join(checkpoint_dir, "checkpoint")
            save((net.state_dict(), optimizer.state_dict()), path)
        tune.report(loss=loss, accuracy=accuracy)
    print("Finished Training")
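Note: every epoch ends by saving a (model_state, optimizer_state) tuple into the directory provided by tune.checkpoint_dir and then calling tune.report, which is what lets the ASHA scheduler stop weak trials early; main() later reloads the best trial's weights from the same tuple. A minimal sketch of restoring such a checkpoint outside of Tune, not part of the commit (the path is a placeholder):

from os.path import join

from torch import load
from torch.optim import SGD

from NeuralNetwork import NeuralNetwork

checkpoint_dir = "/tmp/ray_results/trial_0/checkpoint_000009"   # placeholder
net = NeuralNetwork(l1=128, l2=64)
optimizer = SGD(net.parameters(), lr=3e-3, momentum=0.9)

# Checkpoints were written as a (model_state, optimizer_state) tuple.
model_state, optimizer_state = load(join(checkpoint_dir, "checkpoint"))
net.load_state_dict(model_state)
optimizer.load_state_dict(optimizer_state)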