DDP version of DOPE train.py
#!/usr/bin/env python3

# Copyright (c) 2018 NVIDIA Corporation. All rights reserved.
# This work is licensed under a Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International License.
# https://creativecommons.org/licenses/by-nc-sa/4.0/legalcode

from __future__ import print_function

######################################################
""" | |
HOW TO TRAIN DOPE | |
This is the DOPE training code. | |
It is provided as a convenience for researchers, but it is otherwise unsupported. | |
Please refer to `python3 train.py --help` for specific details about the | |
training code. | |
If you download the FAT dataset | |
(https://research.nvidia.com/publication/2018-06_Falling-Things) | |
you can train a YCB object DOPE detector as follows: | |
``` | |
python3 train.py --data path/to/FAT --object soup --checkpoints soup | |
--gpuids 0 1 2 3 4 5 6 7 | |
``` | |
This will create a folder called `train_soup` where the weights will be saved | |
after each epoch. It will use the 8 gpus using pytorch data parallel. | |
""" | |
import sys
import argparse
import random
import numpy as np
import torch
import torch.nn as nn
import torch.nn.parallel
import torch.optim as optim
from torch.optim.lr_scheduler import CosineAnnealingLR
import torch.utils.data
import torchvision.transforms as transforms
from torch.utils.data.distributed import DistributedSampler
from torch.autograd import Variable
import torch.utils.data as data
import torchvision.models as models
import torch.distributed as dist
import datetime
import json
import glob
import os

from PIL import Image
from PIL import ImageDraw
from PIL import ImageEnhance

from math import acos
from math import sqrt
from math import pi

from os.path import exists

import cv2
import colorsys
import inspect
from distutils.util import strtobool
import configparser
import logging

sys.path.append("/home/azureuser/dope/scripts/train2")
# from dope.utils import make_grid
from train2.utils_dope import make_grid

import warnings
import traceback

import platform
print(platform.python_implementation())
print(platform.python_version())
print(platform.uname())

import mlflow

# https://github.com/Azure/medical-imaging/blob/main/3d-brain-tumor-segmentation/src/train-brats21.py
# Avoid flooding of debug messages in logs
logging.basicConfig(level=logging.WARNING)
logging.getLogger("requests").setLevel(logging.WARNING)
logging.getLogger("azureml").setLevel(logging.WARNING)
logging.getLogger("azure").setLevel(logging.WARNING)
logging.getLogger("azure.core").setLevel(logging.WARNING)
logging.getLogger("azure.mlflow").setLevel(logging.WARNING)

# mlflow.autolog(silent=True)
mlflow.autolog()

script_path = os.getcwd()
print('training script path: ', script_path)

warnings.filterwarnings("ignore")
##################################################
# NEURAL NETWORK MODEL
##################################################

class DopeNetwork(nn.Module):
    def __init__(
            self,
            pretrained=False,
            numBeliefMap=9,
            numAffinity=16,
            stop_at_stage=6  # number of stages to process (if less than total number of stages)
            ):
        super(DopeNetwork, self).__init__()

        self.stop_at_stage = stop_at_stage

        if pretrained is False:
            print("Training network without imagenet weights.")
        else:
            print("Training network pretrained on imagenet.")

        vgg_full = models.vgg19(pretrained=pretrained).features
        self.vgg = nn.Sequential()
        for i_layer in range(24):
            self.vgg.add_module(str(i_layer), vgg_full[i_layer])

        # Add some layers
        i_layer = 23
        self.vgg.add_module(str(i_layer), nn.Conv2d(512, 256, kernel_size=3, stride=1, padding=1))
        self.vgg.add_module(str(i_layer+1), nn.ReLU(inplace=True))
        self.vgg.add_module(str(i_layer+2), nn.Conv2d(256, 128, kernel_size=3, stride=1, padding=1))
        self.vgg.add_module(str(i_layer+3), nn.ReLU(inplace=True))

        # print('---Belief------------------------------------------------')
        # _2 are the belief map stages
        self.m1_2 = DopeNetwork.create_stage(128, numBeliefMap, True)
        self.m2_2 = DopeNetwork.create_stage(128 + numBeliefMap + numAffinity,
                                             numBeliefMap, False)
        self.m3_2 = DopeNetwork.create_stage(128 + numBeliefMap + numAffinity,
                                             numBeliefMap, False)
        self.m4_2 = DopeNetwork.create_stage(128 + numBeliefMap + numAffinity,
                                             numBeliefMap, False)
        self.m5_2 = DopeNetwork.create_stage(128 + numBeliefMap + numAffinity,
                                             numBeliefMap, False)
        self.m6_2 = DopeNetwork.create_stage(128 + numBeliefMap + numAffinity,
                                             numBeliefMap, False)

        # print('---Affinity----------------------------------------------')
        # _1 are the affinity map stages
        self.m1_1 = DopeNetwork.create_stage(128, numAffinity, True)
        self.m2_1 = DopeNetwork.create_stage(128 + numBeliefMap + numAffinity,
                                             numAffinity, False)
        self.m3_1 = DopeNetwork.create_stage(128 + numBeliefMap + numAffinity,
                                             numAffinity, False)
        self.m4_1 = DopeNetwork.create_stage(128 + numBeliefMap + numAffinity,
                                             numAffinity, False)
        self.m5_1 = DopeNetwork.create_stage(128 + numBeliefMap + numAffinity,
                                             numAffinity, False)
        self.m6_1 = DopeNetwork.create_stage(128 + numBeliefMap + numAffinity,
                                             numAffinity, False)
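    # Architecture note (added for clarity): stage 1 sees only the 128-channel
    # VGG features; every later stage is fed those same features concatenated
    # with the previous stage's belief maps (numBeliefMap=9, the 8 cuboid
    # corners plus the centroid) and affinity fields (numAffinity=16, an x/y
    # vector field for each of the 8 corners), i.e. 128 + 9 + 16 = 153 input
    # channels per refinement stage.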
    def forward(self, x):
        '''Runs inference on the neural network.'''
        out1 = self.vgg(x)

        out1_2 = self.m1_2(out1)
        out1_1 = self.m1_1(out1)

        if self.stop_at_stage == 1:
            return [out1_2],\
                   [out1_1]

        out2 = torch.cat([out1_2, out1_1, out1], 1)
        out2_2 = self.m2_2(out2)
        out2_1 = self.m2_1(out2)

        if self.stop_at_stage == 2:
            return [out1_2, out2_2],\
                   [out1_1, out2_1]

        out3 = torch.cat([out2_2, out2_1, out1], 1)
        out3_2 = self.m3_2(out3)
        out3_1 = self.m3_1(out3)

        if self.stop_at_stage == 3:
            return [out1_2, out2_2, out3_2],\
                   [out1_1, out2_1, out3_1]

        out4 = torch.cat([out3_2, out3_1, out1], 1)
        out4_2 = self.m4_2(out4)
        out4_1 = self.m4_1(out4)

        if self.stop_at_stage == 4:
            return [out1_2, out2_2, out3_2, out4_2],\
                   [out1_1, out2_1, out3_1, out4_1]

        out5 = torch.cat([out4_2, out4_1, out1], 1)
        out5_2 = self.m5_2(out5)
        out5_1 = self.m5_1(out5)

        if self.stop_at_stage == 5:
            return [out1_2, out2_2, out3_2, out4_2, out5_2],\
                   [out1_1, out2_1, out3_1, out4_1, out5_1]

        out6 = torch.cat([out5_2, out5_1, out1], 1)
        out6_2 = self.m6_2(out6)
        out6_1 = self.m6_1(out6)

        return [out1_2, out2_2, out3_2, out4_2, out5_2, out6_2],\
               [out1_1, out2_1, out3_1, out4_1, out5_1, out6_1]
    @staticmethod
    def create_stage(in_channels, out_channels, first=False):
        '''Create the neural network layers for a single stage.'''
        model = nn.Sequential()
        mid_channels = 128
        if first:
            padding = 1
            kernel = 3
            count = 6
            final_channels = 512
        else:
            padding = 3
            kernel = 7
            count = 10
            final_channels = mid_channels

        # First convolution
        model.add_module("0",
                         nn.Conv2d(
                             in_channels,
                             mid_channels,
                             kernel_size=kernel,
                             stride=1,
                             padding=padding)
                         )

        # Middle convolutions
        i = 1
        while i < count - 1:
            model.add_module(str(i), nn.ReLU(inplace=True))
            i += 1
            model.add_module(str(i),
                             nn.Conv2d(
                                 mid_channels,
                                 mid_channels,
                                 kernel_size=kernel,
                                 stride=1,
                                 padding=padding))
            i += 1

        # Penultimate convolution
        model.add_module(str(i), nn.ReLU(inplace=True))
        i += 1
        model.add_module(str(i), nn.Conv2d(mid_channels, final_channels, kernel_size=1, stride=1))
        i += 1

        # Last convolution
        model.add_module(str(i), nn.ReLU(inplace=True))
        i += 1
        model.add_module(str(i), nn.Conv2d(final_channels, out_channels, kernel_size=1, stride=1))
        i += 1

        return model
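# Shape sketch (added for clarity; the numbers follow from the VGG trunk's
# three 2x2 max-pools and the stage definitions above): for a 400x400 input,
# each of the six returned belief tensors is (N, 9, 50, 50) and each affinity
# tensor is (N, 16, 50, 50), matching the 50x50 padding logic in the data
# loader below.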
##################################################
# UTILS CODE FOR LOADING THE DATA
##################################################

def default_loader(path):
    return Image.open(path).convert('RGB')


def loadjson(path, objectofinterest):
    """
    Loads the data from a json file.
    If there are no objects of interest, then load all the objects.
    """
    with open(path) as data_file:
        data = json.load(data_file)
    pointsBelief = []
    centroids = []
    translations = []
    rotations = []
    points = []

    for i_line in range(len(data['objects'])):
        info = data['objects'][i_line]
        if objectofinterest is not None and \
           objectofinterest not in info['class'].lower():
            continue

        # 3d bbox with belief maps
        points3d = []

        pointdata = info['projected_cuboid']
        for p in pointdata:
            points3d.append((p[0], p[1]))

        if len(points3d) == 8:
            # NDDS format: 8 points in 'projected_cuboid', 1 point in 'projected_cuboid_centroid'
            pcenter = info['projected_cuboid_centroid']
            points3d.append((pcenter[0], pcenter[1]))
        elif len(points3d) == 9:
            # nvisii format: 9 points in 'projected_cuboid', no 'projected_cuboid_centroid' key
            pcenter = points3d[-1]
        else:
            raise RuntimeError(f'projected_cuboid has to have 8 or 9 points while reading "{path}"')

        pointsBelief.append(points3d)
        points.append(points3d + [(pcenter[0], pcenter[1])])  # NOTE: Adding the centroid again is probably a bug.
        centroids.append((pcenter[0], pcenter[1]))

        # load translations
        location = info['location']
        translations.append([location[0], location[1], location[2]])

        # quaternion
        rot = info["quaternion_xyzw"]
        rotations.append(rot)

    return {
        "pointsBelief": pointsBelief,
        "rotations": rotations,
        "translations": translations,
        "centroids": centroids,
        "points": points,
        "keypoints_2d": [],
    }
def loadimages(root):
    """
    Find all the images in the path and folders, return them in imgs.
    """
    imgs = []

    def add_json_files(path):
        for imgpath in glob.glob(path + "/*.png"):
            if exists(imgpath) and exists(imgpath.replace('png', "json")):
                imgs.append((imgpath, imgpath.replace(path, "").replace("/", ""),
                             imgpath.replace('png', "json")))
        for imgpath in glob.glob(path + "/*.jpg"):
            if exists(imgpath) and exists(imgpath.replace('jpg', "json")):
                imgs.append((imgpath, imgpath.replace(path, "").replace("/", ""),
                             imgpath.replace('jpg', "json")))

    def explore(path):
        if not os.path.isdir(path):
            return
        folders = [os.path.join(path, o) for o in os.listdir(path)
                   if os.path.isdir(os.path.join(path, o))]
        if len(folders) > 0:
            for path_entry in folders:
                explore(path_entry)
        add_json_files(path)

    explore(root)

    return imgs
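# Data layout note (added for clarity, inferred from the loaders above and
# from __getitem__ below): each training folder is expected to contain
# image/annotation pairs such as 00001.png + 00001.json (NDDS or nvisii
# output), plus one _camera_settings.json and one _object_settings.json per
# folder for the camera intrinsics and the cuboid dimensions.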
class MultipleVertexJson(data.Dataset):
    """
    Dataloader for the data generated by NDDS (https://github.com/NVIDIA/Dataset_Synthesizer).
    This is the same data as the data used in FAT.
    """
    def __init__(self, root, transform=None, nb_vertex=8,
                 keep_orientation=True,
                 normal=None, test=False,
                 target_transform=None,
                 loader=default_loader,
                 objectofinterest="",
                 img_size=400,
                 save=False,
                 noise=2,
                 data_size=None,
                 sigma=16,
                 random_translation=(25.0, 25.0),
                 random_rotation=15.0,
                 ):
        ###################
        self.objectofinterest = objectofinterest
        self.img_size = img_size
        self.loader = loader
        self.transform = transform
        self.target_transform = target_transform
        self.root = root
        self.imgs = []
        self.test = test
        self.normal = normal
        self.keep_orientation = keep_orientation
        self.save = save
        self.noise = noise
        self.data_size = data_size
        self.sigma = sigma
        self.random_translation = random_translation
        self.random_rotation = random_rotation

        def load_data(path):
            '''Recursively load the data. This is useful to load all of the FAT dataset.'''
            imgs = loadimages(path)

            # Check all the folders in path
            for name in os.listdir(str(path)):
                imgs += loadimages(path + "/" + name)
            return imgs

        self.imgs = load_data(root)

        # Shuffle the data, this is useful when we want to use a subset.
        np.random.shuffle(self.imgs)

    def __len__(self):
        # When limiting the number of data
        if self.data_size is not None:
            return int(self.data_size)
        return len(self.imgs)
    def __getitem__(self, index):
        """
        Depending on how the data loader is configured,
        this will return the debug info with the cuboid drawn on it,
        this happens when self.save is set to true.
        Otherwise, during training this function returns the
        belief maps and affinity fields and image as tensors.
        """
        path, name, txt = self.imgs[index]
        img = self.loader(path)

        img_size = img.size
        img_size = (400, 400)  # NOTE: hard-coded; this overrides the actual image size read above.

        loader = loadjson

        data = loader(txt, self.objectofinterest)

        pointsBelief = data['pointsBelief']
        objects_centroid = data['centroids']
        points_all = data['points']
        points_keypoints = data['keypoints_2d']
        translations = torch.from_numpy(np.array(
            data['translations'])).float()
        rotations = torch.from_numpy(np.array(
            data['rotations'])).float()

        if len(points_all) == 0:
            points_all = torch.zeros(1, 10, 2).double()

        # self.save == true assumes there is only
        # one object instance in the scene.
        if translations.size()[0] > 1:
            translations = translations[0].unsqueeze(0)
            rotations = rotations[0].unsqueeze(0)

        # If there are no objects, still need to return similar shape array
        if len(translations) == 0:
            translations = torch.zeros(1, 3).float()
            rotations = torch.zeros(1, 4).float()

        # Camera intrinsics
        path_cam = path.replace(name, '_camera_settings.json')
        with open(path_cam) as data_file:
            data = json.load(data_file)
        # Assumes one camera
        cam = data['camera_settings'][0]['intrinsic_settings']

        matrix_camera = np.zeros((3, 3))
        matrix_camera[0, 0] = cam['fx']
        matrix_camera[1, 1] = cam['fy']
        matrix_camera[0, 2] = cam['cx']
        matrix_camera[1, 2] = cam['cy']
        matrix_camera[2, 2] = 1

        # Load the cuboid sizes
        path_set = path.replace(name, '_object_settings.json')
        with open(path_set) as data_file:
            data = json.load(data_file)

        cuboid = torch.zeros(1)

        if self.objectofinterest is None:
            cuboid = np.array(data['exported_objects'][0]['cuboid_dimensions'])
        else:
            for info in data["exported_objects"]:
                if self.objectofinterest in info['class']:
                    cuboid = np.array(info['cuboid_dimensions'])

        img_original = img.copy()
        def Reproject(points, tm, rm):
            """
            Reprojection of points when rotating the image
            """
            proj_cuboid = np.array(points)

            rmat = np.identity(3)
            rmat[0:2] = rm
            tmat = np.identity(3)
            tmat[0:2] = tm

            new_cuboid = np.matmul(
                rmat, np.vstack((proj_cuboid.T, np.ones(len(points)))))
            new_cuboid = np.matmul(tmat, new_cuboid)
            new_cuboid = new_cuboid[0:2].T

            return new_cuboid

        # Random image manipulation, rotation and translation with zero padding.
        # The numpy-based sampling below creates a bug (identical augmentations
        # across data loader workers), thank you to
        # https://tanelp.github.io/posts/a-bug-that-plagues-thousands-of-open-source-ml-projects/
        # dx = round(np.random.normal(0, 2) * float(self.random_translation[0]))
        # dy = round(np.random.normal(0, 2) * float(self.random_translation[1]))
        # angle = round(np.random.normal(0, 1) * float(self.random_rotation))
        dx = round(float(torch.normal(torch.tensor(0.0), torch.tensor(2.0)) * float(self.random_translation[0])))
        dy = round(float(torch.normal(torch.tensor(0.0), torch.tensor(2.0)) * float(self.random_translation[1])))
        angle = round(float(torch.normal(torch.tensor(0.0), torch.tensor(1.0)) * float(self.random_rotation)))

        tm = np.float32([[1, 0, dx], [0, 1, dy]])
        rm = cv2.getRotationMatrix2D(
            (img.size[0]/2, img.size[1]/2), angle, 1)

        for i_objects in range(len(pointsBelief)):
            points = pointsBelief[i_objects]
            new_cuboid = Reproject(points, tm, rm)
            pointsBelief[i_objects] = new_cuboid.tolist()
            objects_centroid[i_objects] = tuple(new_cuboid.tolist()[-1])
            pointsBelief[i_objects] = list(map(tuple, pointsBelief[i_objects]))

        for i_objects in range(len(points_keypoints)):
            points = points_keypoints[i_objects]
            new_cuboid = Reproject(points, tm, rm)
            points_keypoints[i_objects] = new_cuboid.tolist()
            points_keypoints[i_objects] = list(map(tuple, points_keypoints[i_objects]))

        image_r = cv2.warpAffine(np.array(img), rm, img.size)
        result = cv2.warpAffine(image_r, tm, img.size)

        img = Image.fromarray(result)
        # Note: All point coordinates are in the image space, e.g., pixel value.

        # This is used when we do saving --- helpful for debugging
        if self.save or self.test:
            # Use the save to debug the data
            if self.test:
                draw = ImageDraw.Draw(img_original)
            else:
                draw = ImageDraw.Draw(img)

            # PIL drawing functions, here for sharing draw
            def DrawKeypoints(points):
                for key in points:
                    DrawDot(key, (12, 115, 170), 7)

            def DrawLine(point1, point2, lineColor, lineWidth):
                if point1 is not None and point2 is not None:
                    draw.line([point1, point2], fill=lineColor, width=lineWidth)

            def DrawDot(point, pointColor, pointRadius):
                if point is not None:
                    xy = [point[0]-pointRadius, point[1]-pointRadius, point[0]+pointRadius, point[1]+pointRadius]
                    draw.ellipse(xy, fill=pointColor, outline=pointColor)

            def DrawCube(points, which_color=0, color=None):
                '''Draw cube with a thick solid line across the front top edge.'''
                lineWidthForDrawing = 2
                lineColor1 = (255, 215, 0)   # yellow-ish
                lineColor2 = (12, 115, 170)  # blue-ish
                lineColor3 = (45, 195, 35)   # green-ish

                if which_color == 3:
                    lineColor = lineColor3
                else:
                    lineColor = lineColor1

                if color is not None:
                    lineColor = color

                # draw front
                DrawLine(points[0], points[1], lineColor, 8)  # lineWidthForDrawing)
                DrawLine(points[1], points[2], lineColor, lineWidthForDrawing)
                DrawLine(points[3], points[2], lineColor, lineWidthForDrawing)
                DrawLine(points[3], points[0], lineColor, lineWidthForDrawing)

                # draw back
                DrawLine(points[4], points[5], lineColor, lineWidthForDrawing)
                DrawLine(points[6], points[5], lineColor, lineWidthForDrawing)
                DrawLine(points[6], points[7], lineColor, lineWidthForDrawing)
                DrawLine(points[4], points[7], lineColor, lineWidthForDrawing)

                # draw sides
                DrawLine(points[0], points[4], lineColor, lineWidthForDrawing)
                DrawLine(points[7], points[3], lineColor, lineWidthForDrawing)
                DrawLine(points[5], points[1], lineColor, lineWidthForDrawing)
                DrawLine(points[2], points[6], lineColor, lineWidthForDrawing)

                # draw dots
                DrawDot(points[0], pointColor=(255, 255, 255), pointRadius=3)
                DrawDot(points[1], pointColor=(0, 0, 0), pointRadius=3)

            # Draw all the found objects.
            for points_belief_objects in pointsBelief:
                DrawCube(points_belief_objects)
            for keypoint in points_keypoints:
                DrawKeypoints(keypoint)

            img = self.transform(img)

            return {
                "img": img,
                "translations": translations,
                "rot_quaternions": rotations,
                'pointsBelief': np.array(points_all[0]),
                'matrix_camera': matrix_camera,
                'img_original': np.array(img_original),
                'cuboid': cuboid,
                'file_name': name,
            }
        # Create the belief map
        beliefsImg = CreateBeliefMap(
            img,
            pointsBelief=pointsBelief,
            nbpoints=9,
            sigma=self.sigma)

        # Create the image maps for belief
        transform = transforms.Compose([transforms.Resize(min(img_size))])
        totensor = transforms.Compose([transforms.ToTensor()])

        for j in range(len(beliefsImg)):
            beliefsImg[j] = self.target_transform(beliefsImg[j])
            # beliefsImg[j].save('{}.png'.format(j))
            beliefsImg[j] = totensor(beliefsImg[j])

        beliefs = torch.zeros((len(beliefsImg), beliefsImg[0].size(1), beliefsImg[0].size(2)))
        for j in range(len(beliefsImg)):
            beliefs[j] = beliefsImg[j][0]

        # Create affinity maps
        scale = 8
        if min(img.size) / 8.0 != min(img_size) / 8.0:
            # print (scale)
            scale = min(img.size) / (min(img_size) / 8.0)

        affinities = GenerateMapAffinity(img, 8, pointsBelief, objects_centroid, scale)
        img = self.transform(img)

        # Transform the images for training input
        w_crop = np.random.randint(0, img.size[0] - img_size[0] + 1)
        h_crop = np.random.randint(0, img.size[1] - img_size[1] + 1)

        transform = transforms.Compose([transforms.Resize(min(img_size))])
        totensor = transforms.Compose([transforms.ToTensor()])

        if self.normal is not None:
            normalize = transforms.Compose([transforms.Normalize
                                            ((self.normal[0], self.normal[0], self.normal[0]),
                                             (self.normal[1], self.normal[1], self.normal[1])),
                                            AddNoise(self.noise)])
        else:
            normalize = transforms.Compose([AddNoise(0.0001)])

        img = crop(img, h_crop, w_crop, img_size[1], img_size[0])
        img = totensor(img)
        img = normalize(img)

        w_crop = int(w_crop / 8)
        h_crop = int(h_crop / 8)

        affinities = affinities[:, h_crop:h_crop + int(img_size[1] / 8), w_crop:w_crop + int(img_size[0] / 8)]
        beliefs = beliefs[:, h_crop:h_crop + int(img_size[1] / 8), w_crop:w_crop + int(img_size[0] / 8)]

        if affinities.size()[1] == 49 and not self.test:
            affinities = torch.cat([affinities, torch.zeros(16, 1, 50)], dim=1)

        if affinities.size()[2] == 49 and not self.test:
            affinities = torch.cat([affinities, torch.zeros(16, 50, 1)], dim=2)

        return {
            'img': img,
            "affinities": affinities,
            'beliefs': beliefs,
        }
""" | |
Some simple vector math functions to find the angle | |
between two points, used by affinity fields. | |
""" | |
def length(v): | |
return sqrt(v[0]**2+v[1]**2) | |
def dot_product(v,w): | |
return v[0]*w[0]+v[1]*w[1] | |
def normalize(v): | |
norm=np.linalg.norm(v, ord=1) | |
if norm==0: | |
norm=np.finfo(v.dtype).eps | |
return v/norm | |
def determinant(v,w): | |
return v[0]*w[1]-v[1]*w[0] | |
def inner_angle(v,w): | |
cosx=dot_product(v,w)/(length(v)*length(w)) | |
rad=acos(cosx) # in radians | |
return rad*180/pi # returns degrees | |
def py_ang(A, B=(1,0)): | |
inner=inner_angle(A,B) | |
det = determinant(A,B) | |
if det<0: #this is a property of the det. If the det < 0 then B is clockwise of A | |
return inner | |
else: # if the det > 0 then A is immediately clockwise of B | |
return 360-inner | |
def GenerateMapAffinity(img, nb_vertex, pointsInterest, objects_centroid, scale):
    """
    Function to create the affinity maps,
    e.g., vector maps pointing toward the object center.

    Args:
        img: PIL image
        nb_vertex: (int) number of points
        pointsInterest: list of points
        objects_centroid: (x,y) centroids for the objects
        scale: (float) by how much you need to scale down the image
    return:
        return a list of tensors for each point except centroid point
    """

    # Apply the downscale right now, so the vectors are correct.
    img_affinity = Image.new(img.mode, (int(img.size[0]/scale), int(img.size[1]/scale)), "black")

    # Create the empty tensors
    totensor = transforms.Compose([transforms.ToTensor()])

    affinities = []
    for i_points in range(nb_vertex):
        affinities.append(torch.zeros(2, int(img.size[1]/scale), int(img.size[0]/scale)))

    for i_pointsImage in range(len(pointsInterest)):
        pointsImage = pointsInterest[i_pointsImage]
        center = objects_centroid[i_pointsImage]
        for i_points in range(nb_vertex):
            point = pointsImage[i_points]

            affinity_pair, img_affinity = getAfinityCenter(int(img.size[0]/scale),
                                                           int(img.size[1]/scale),
                                                           tuple((np.array(pointsImage[i_points])/scale).tolist()),
                                                           tuple((np.array(center)/scale).tolist()),
                                                           img_affinity=img_affinity, radius=1)

            affinities[i_points] = (affinities[i_points] + affinity_pair) / 2

            # Normalizing
            v = affinities[i_points].numpy()

            xvec = v[0]
            yvec = v[1]

            norms = np.sqrt(xvec * xvec + yvec * yvec)
            nonzero = norms > 0

            xvec[nonzero] /= norms[nonzero]
            yvec[nonzero] /= norms[nonzero]

            affinities[i_points] = torch.from_numpy(np.concatenate([[xvec], [yvec]]))

    affinities = torch.cat(affinities, 0)

    return affinities
def getAfinityCenter(width, height, point, center, radius=7, img_affinity=None):
    """
    Function to create the affinity maps,
    e.g., vector maps pointing toward the object center.

    Args:
        width: image width
        height: image height
        point: (x,y)
        center: (x,y)
        radius: pixel radius
        img_affinity: tensor to add to
    return:
        return a tensor
    """
    tensor = torch.zeros(2, height, width).float()

    # Create the canvas for the affinity output
    imgAffinity = Image.new("RGB", (width, height), "black")
    totensor = transforms.Compose([transforms.ToTensor()])

    draw = ImageDraw.Draw(imgAffinity)
    r1 = radius
    p = point
    draw.ellipse((p[0]-r1, p[1]-r1, p[0]+r1, p[1]+r1), (255, 255, 255))
    del draw

    # Compute the array to add the affinity
    array = (np.array(imgAffinity)/255)[:, :, 0]

    angle_vector = np.array(center) - np.array(point)
    angle_vector = normalize(angle_vector)
    affinity = np.concatenate([[array*angle_vector[0]], [array*angle_vector[1]]])

    # print (tensor)
    if img_affinity is not None:
        # Find the angle vector
        # print (angle_vector)
        if length(angle_vector) > 0:
            angle = py_ang(angle_vector)
        else:
            angle = 0
        # print(angle)
        c = np.array(colorsys.hsv_to_rgb(angle/360, 1, 1)) * 255
        draw = ImageDraw.Draw(img_affinity)
        draw.ellipse((p[0]-r1, p[1]-r1, p[0]+r1, p[1]+r1), fill=(int(c[0]), int(c[1]), int(c[2])))
        del draw
    re = torch.from_numpy(affinity).float() + tensor

    return re, img_affinity
def CreateBeliefMap(img, pointsBelief, nbpoints, sigma=16):
    """
    Args:
        img: image
        pointsBelief: list of points in the form of
                      [nb object, nb points, 2 (x,y)]
        nbpoints: (int) number of points; this script uses 9
                  (the 8 cuboid corners plus the centroid)
        sigma: (int) size of the belief map point; each keypoint is rendered
               as an unnormalized Gaussian
               exp(-((i - px)^2 + (j - py)^2) / (2 * sigma^2))
    return:
        return an array of PIL black and white images representing the
        belief maps
    """
    beliefsImg = []
    sigma = int(sigma)
    for numb_point in range(nbpoints):
        array = np.zeros(img.size)
        out = np.zeros(img.size)

        for point in pointsBelief:
            p = point[numb_point]
            w = int(sigma * 2)
            if p[0]-w >= 0 and p[0]+w < img.size[0] and p[1]-w >= 0 and p[1]+w < img.size[1]:
                for i in range(int(p[0])-w, int(p[0])+w):
                    for j in range(int(p[1])-w, int(p[1])+w):
                        array[i, j] = np.exp(-(((i - p[0])**2 + (j - p[1])**2) / (2 * (sigma**2))))

        stack = np.stack([array, array, array], axis=0).transpose(2, 1, 0)
        imgBelief = Image.new(img.mode, img.size, "black")
        beliefsImg.append(Image.fromarray((stack*255).astype('uint8')))
    return beliefsImg
def crop(img, i, j, h, w):
    """
    Crop the given PIL.Image.

    Args:
        img (PIL.Image): Image to be cropped.
        i: Upper pixel coordinate.
        j: Left pixel coordinate.
        h: Height of the cropped image.
        w: Width of the cropped image.
    Returns:
        PIL.Image: Cropped image.
    """
    return img.crop((j, i, j + w, i + h))


class AddRandomContrast(object):
    """
    Apply some random contrast from PIL
    """
    def __init__(self, sigma=0.1):
        self.sigma = sigma

    def __call__(self, im):
        contrast = ImageEnhance.Contrast(im)
        im = contrast.enhance(np.random.normal(1, self.sigma))
        return im


class AddRandomBrightness(object):
    """
    Apply some random brightness from PIL
    """
    def __init__(self, sigma=0.1):
        self.sigma = sigma

    def __call__(self, im):
        bright = ImageEnhance.Brightness(im)
        im = bright.enhance(np.random.normal(1, self.sigma))
        return im


class AddNoise(object):
    """
    Add zero-mean Gaussian noise with the given standard deviation to each
    channel of the torch.*Tensor, then clamp the result to [-1, 1].
    """
    def __init__(self, std=0.1):
        self.std = std

    def __call__(self, tensor):
        # TODO: make efficient
        # t = torch.FloatTensor(tensor.size()).uniform_(self.min,self.max)
        t = torch.FloatTensor(tensor.size()).normal_(0, self.std)

        t = tensor.add(t)
        t = torch.clamp(t, -1, 1)  # this is expensive
        return t
def save_image(tensor, filename, nrow=4, padding=2, mean=None, std=None):
    """
    Saves a given Tensor into an image file.
    If given a mini-batch tensor, will save the tensor as a grid of images.
    """
    from PIL import Image

    tensor = tensor.cpu()
    grid = make_grid(tensor, nrow=nrow, padding=10, pad_value=1)
    if mean is not None:
        ndarr = grid.mul(std).add(mean).mul(255).byte().transpose(0, 2).transpose(0, 1).numpy()
    else:
        ndarr = grid.mul(0.5).add(0.5).mul(255).byte().transpose(0, 2).transpose(0, 1).numpy()
    im = Image.fromarray(ndarr)
    im.save(filename)


def DrawLine(point1, point2, lineColor, lineWidth, draw):
    if point1 is not None and point2 is not None:
        draw.line([point1, point2], fill=lineColor, width=lineWidth)


def DrawDot(point, pointColor, pointRadius, draw):
    if point is not None:
        xy = [point[0]-pointRadius, point[1]-pointRadius, point[0]+pointRadius, point[1]+pointRadius]
        draw.ellipse(xy, fill=pointColor, outline=pointColor)


def DrawCube(points, which_color=0, color=None, draw=None):
    '''Draw cube with a thick solid line across the front top edge.'''
    lineWidthForDrawing = 2
    lineWidthThick = 8
    lineColor1 = (255, 215, 0)   # yellow-ish
    lineColor2 = (12, 115, 170)  # blue-ish
    lineColor3 = (45, 195, 35)   # green-ish

    if which_color == 3:
        lineColor = lineColor3
    else:
        lineColor = lineColor1

    if color is not None:
        lineColor = color

    # draw front
    DrawLine(points[0], points[1], lineColor, lineWidthThick, draw)
    DrawLine(points[1], points[2], lineColor, lineWidthForDrawing, draw)
    DrawLine(points[3], points[2], lineColor, lineWidthForDrawing, draw)
    DrawLine(points[3], points[0], lineColor, lineWidthForDrawing, draw)

    # draw back
    DrawLine(points[4], points[5], lineColor, lineWidthForDrawing, draw)
    DrawLine(points[6], points[5], lineColor, lineWidthForDrawing, draw)
    DrawLine(points[6], points[7], lineColor, lineWidthForDrawing, draw)
    DrawLine(points[4], points[7], lineColor, lineWidthForDrawing, draw)

    # draw sides
    DrawLine(points[0], points[4], lineColor, lineWidthForDrawing, draw)
    DrawLine(points[7], points[3], lineColor, lineWidthForDrawing, draw)
    DrawLine(points[5], points[1], lineColor, lineWidthForDrawing, draw)
    DrawLine(points[2], points[6], lineColor, lineWidthForDrawing, draw)

    # draw dots
    DrawDot(points[0], pointColor=lineColor, pointRadius=4, draw=draw)
    DrawDot(points[1], pointColor=lineColor, pointRadius=4, draw=draw)
##################################################
# TRAINING CODE MAIN STARTING HERE
##################################################

print("start:", datetime.datetime.now().time())

conf_parser = argparse.ArgumentParser(
    description=__doc__,  # printed with -h/--help
    # Don't mess with format of description
    formatter_class=argparse.RawDescriptionHelpFormatter,
    # Turn off help, so we print all options in response to -h
    add_help=False
)
conf_parser.add_argument("-c", "--config",
                         help="Specify config file", metavar="FILE")
parser = argparse.ArgumentParser()

parser.add_argument('--data',
                    default="",
                    help='path to training data')
parser.add_argument('--datatest',
                    default="",
                    help='path to data testing set')
parser.add_argument('--object',
                    default="cracker",
                    help='which object of interest in the dataset')
parser.add_argument('--num_workers',
                    type=int,
                    default=8,
                    help='number of data loading workers')
parser.add_argument('--prefetch_factor',
                    type=int,
                    default=2,
                    required=False,
                    help='Data loader prefetch factor')
parser.add_argument('--persistent_workers',
                    type=strtobool,
                    default=True,
                    required=False,
                    help='Use persistent prefetching workers')
parser.add_argument('--pin_memory',
                    type=strtobool,
                    default=True,
                    required=False,
                    help='Use pinned memory in the data loader')
parser.add_argument('--non_blocking',
                    type=strtobool,
                    default=False,
                    required=False,
                    help='Use non-blocking transfer to device')
parser.add_argument('--enable_profiling',
                    type=strtobool,
                    default=False,
                    required=False,
                    help='Enable PyTorch profiler')
parser.add_argument('--batch_size',
                    type=int,
                    default=64,
                    required=False,
                    help='input batch size')
parser.add_argument('--imagesize',
                    type=int,
                    default=400,
                    help='the height / width of the input image to network')
parser.add_argument('--model_arch',
                    type=str,
                    required=False,
                    default='resnet18',
                    help='which model architecture to use')
parser.add_argument('--learning_rate',
                    type=float,
                    default=0.0001,
                    help='learning rate, default=0.0001')
parser.add_argument('--momentum',
                    type=float,
                    default=0.9,
                    help='Momentum for the optimizer')
parser.add_argument('--noise',
                    type=float,
                    default=2.0,
                    help='gaussian noise added to the image')
parser.add_argument('--net',
                    default='',
                    help="path to net (to continue training)")
parser.add_argument('--namefile',
                    default='epoch',
                    help="name to put on the file of the saved weights")
parser.add_argument('--manualseed',
                    type=int,
                    help='manual seed')
parser.add_argument('--register_model_as',
                    type=str,
                    default=None,
                    required=False,
                    help='Name to register the final model in MLFlow')
parser.add_argument('--num_epochs',
                    type=int,
                    default=1,
                    help="number of epochs to train")
parser.add_argument('--loginterval',
                    type=int,
                    default=100)
parser.add_argument('--gpuids',
                    nargs='+',
                    type=int,
                    default=[0, 1, 2, 3],
                    help='GPUs to use')
parser.add_argument('--distributed_backend',
                    type=str,
                    default="nccl",
                    choices=["nccl", "mpi"],
                    required=False,
                    help='Which distributed backend to use')
parser.add_argument('--checkpoints',
                    type=str,
                    default=None,
                    required=False,
                    help='Path to read/write checkpoints')
parser.add_argument('--sigma',
                    default=4,
                    help='keypoint creation size for sigma')
parser.add_argument('--save',
                    action="store_true",
                    help='save a visual batch and quit, this is for debugging purposes')
parser.add_argument("--model_arch_pretrained",
                    type=strtobool,
                    default=True,
                    required=False,
                    help='Use pretrained model')
parser.add_argument('--nbupdates',
                    default=None,
                    help='max number of network updates; overrides the epoch number, otherwise the number of epochs is used')
parser.add_argument('--datasize',
                    default=None,
                    help='randomly sample that number of entries in the dataset folder')
# Read the config but do not overwrite the args written
args, remaining_argv = conf_parser.parse_known_args()
defaults = {"option": "default"}

if args.config:
    config = configparser.ConfigParser()  # SafeConfigParser was removed; ConfigParser is the modern equivalent
    config.read([args.config])
    defaults.update(dict(config.items("defaults")))

parser.set_defaults(**defaults)
parser.add_argument("--option")
opt = parser.parse_args(remaining_argv)

if opt.model_arch_pretrained in ['false', 'False']:
    opt.model_arch_pretrained = False

if "/" not in opt.checkpoints:
    opt.checkpoints = "train_{}".format(opt.checkpoints)

print("object of interest is: {}".format(opt.object))

try:
    os.makedirs(opt.checkpoints, exist_ok=True)
except OSError as e:
    print("Error creating folder: {}".format(e))
    pass

if opt.manualseed is None:
    opt.manualseed = random.randint(1, 10000)
    print("manual seed set to {}".format(opt.manualseed))

# set the manual seed for reproducibility
random.seed(opt.manualseed)
np.random.seed(opt.manualseed)
torch.manual_seed(opt.manualseed)
torch.cuda.manual_seed_all(opt.manualseed)
# local_rank = 0
local_rank = os.environ["LOCAL_RANK"]
self_is_main_node = False
logger = logging.getLogger(__name__)
print("opt.checkpoints = {}".format(opt.checkpoints))

# save the hyper parameters passed
distributed_backend = opt.distributed_backend
if distributed_backend == "nccl":
    world_size = int(os.environ["WORLD_SIZE"])
    world_rank = int(os.environ['RANK'])
    local_rank = int(os.environ["LOCAL_RANK"])
    multinode_available = world_size > 1
    self_is_main_node = world_rank == 0
    print("world size is: ", world_size)
    print("global rank is {} and local_rank is {}".format(world_rank, local_rank))
else:
    raise NotImplementedError(
        f"distributed_backend={distributed_backend} is not implemented yet."
    )

if self_is_main_node:
    # Write the header once (the original opened the file twice in 'w' mode,
    # so the first write was immediately overwritten).
    with open(opt.checkpoints + '/header.txt', 'w') as file:
        file.write(str(opt) + "\n")
        file.write("seed: " + str(opt.manualseed) + '\n')
    with open(opt.checkpoints + '/test_metric.csv', 'w') as file:
        file.write("epoch, passed,total \n")

dist_url = "env://"  # default
# dist_url = "auto"
is_distributed = world_size > 1
if is_distributed:
    batch_size = opt.batch_size // world_size
    batch_size = max(batch_size, 1)
else:
    batch_size = opt.batch_size
print("is_distributed is {} and batch_size is {}".format(is_distributed, batch_size))
env_dict = {
    key: os.environ[key]
    for key in ("MASTER_ADDR", "MASTER_PORT", "LOCAL_RANK", "RANK", "WORLD_SIZE")
}
print("os.getpid() is {} and initializing process group with {}".format(os.getpid(), env_dict))
logger.info("os.getpid() is {} and initializing process group with {}".format(os.getpid(), env_dict))

# DISTRIBUTED: this is required to initialize the pytorch backend
dist.init_process_group(
    backend=distributed_backend,
    init_method=dist_url,
    timeout=datetime.timedelta(seconds=2000),  # default is 30 min
    world_size=world_size,
    rank=world_rank  # init_process_group expects the global rank, not the local one
)
torch.cuda.set_device(local_rank)
device = torch.device("cuda", local_rank) if torch.cuda.is_available() else 'cpu'
print("device is {}".format(device))

# this will make all .cuda() calls work properly
# synchronize all the threads to reach this point before moving on
dist.barrier()  # is this necessary?

# DISTRIBUTED: in distributed mode, you want to report parameters
# only from main process (rank==0) to avoid conflict
# if self_is_main_node:
# MLFLOW: report relevant parameters using mlflow
logged_params = {
    "instance_per_node": world_size,
    "cuda_available": torch.cuda.is_available(),
    "distributed": multinode_available,
    "distributed_backend": distributed_backend,

    # data loading params
    "batch_size": batch_size,
    "num_epochs": opt.num_epochs,
    "num_workers": opt.num_workers,
    "cpu_count": os.cpu_count(),
    "prefetch_factor": opt.prefetch_factor,
    "persistent_workers": opt.persistent_workers,
    "pin_memory": opt.pin_memory,
    "non_blocking": opt.non_blocking,
    # "multiprocessing_sharing_strategy": opt.multiprocessing_sharing_strategy,

    # training params
    "model_arch": opt.model_arch,
    "model_arch_pretrained": opt.model_arch_pretrained,
    "optimizer.learning_rate": opt.learning_rate,

    # profiling params
    "enable_profiling": opt.enable_profiling,
}

if torch.cuda.is_available():
    # add some gpu properties
    logged_params["cuda_device_count"] = torch.cuda.device_count()
    cuda_device_properties = torch.cuda.get_device_properties(device)
    logged_params["cuda_device_name"] = cuda_device_properties.name
    logged_params["cuda_device_major"] = cuda_device_properties.major
    logged_params["cuda_device_minor"] = cuda_device_properties.minor
    logged_params["cuda_device_memory"] = cuda_device_properties.total_memory
    logged_params["cuda_device_processor_count"] = cuda_device_properties.multi_processor_count

print("MLflow version:", mlflow.__version__)
print("Tracking URI:", mlflow.get_tracking_uri())
print("Artifact URI:", mlflow.get_artifact_uri())

tags = {"team": "ARD",
        "dataset": "FAT120K",
        "model": "pose"}

exp_name = "dope6d"
if self_is_main_node:
    mlflow.set_experiment(experiment_name=exp_name)
    mlflow.set_tags(tags)
    mlflow.log_params(logged_params)
# save
if not opt.save:
    contrast = 0.2
    brightness = 0.2
    noise = 0.1
    normal_imgs = [0.59, 0.25]
    transform = transforms.Compose([
        AddRandomContrast(contrast),
        AddRandomBrightness(brightness),
        transforms.Resize(opt.imagesize),
    ])
else:
    contrast = 0.00001
    brightness = 0.00001
    noise = 0.00001
    normal_imgs = None
    transform = transforms.Compose([
        transforms.Resize(opt.imagesize),
        transforms.ToTensor()])

print("load data")
# load the dataset using the loader in utils_pose
optional_data_loading_kwargs = {}
if opt.num_workers > 0:
    # NOTE: these options _ONLY_ apply if num_workers > 0,
    # otherwise DataLoader raises an exception
    optional_data_loading_kwargs["prefetch_factor"] = opt.prefetch_factor
    optional_data_loading_kwargs["persistent_workers"] = opt.persistent_workers
trainingdata = None
if not opt.data == "":
    train_dataset = MultipleVertexJson(
        root=opt.data,
        objectofinterest=opt.object,
        keep_orientation=True,
        noise=opt.noise,
        sigma=opt.sigma,
        data_size=opt.datasize,
        save=opt.save,
        transform=transform,
        normal=normal_imgs,
        target_transform=transforms.Compose([
            transforms.Resize(opt.imagesize//8),
        ]),
    )
    # With DistributedSampler each process sees its own shard of the data and
    # shuffle=True shuffles within the sampler. The DataLoader's own shuffle
    # flag is mutually exclusive with a sampler, which is why it is only
    # enabled below when there is no sampler.
    num_replicas = world_size
    train_sampler = DistributedSampler(dataset=train_dataset, num_replicas=num_replicas, rank=world_rank, shuffle=True) if is_distributed else None
    print('train data size: ', len(train_dataset))
    trainingdata = torch.utils.data.DataLoader(train_dataset,
                                               batch_size=batch_size,
                                               sampler=train_sampler,
                                               shuffle=(train_sampler is None),
                                               num_workers=opt.num_workers,
                                               pin_memory=True,
                                               **optional_data_loading_kwargs
                                               )
    print('training data len: ', len(trainingdata.dataset))

if opt.save:
    for i in range(2):
        images = next(iter(trainingdata))

        if normal_imgs is None:
            normal_imgs = [0, 1]

        save_image(images['img'], '{}/train_{}.png'.format(opt.checkpoints, str(i).zfill(5)),
                   mean=normal_imgs[0], std=normal_imgs[1])
        print(i)

    print('Checkpoints are saved in {}'.format(opt.checkpoints))
    quit()
testingdata = None
if not opt.datatest == "":
    testingdata = torch.utils.data.DataLoader(
        MultipleVertexJson(
            root=opt.datatest,
            objectofinterest=opt.object,
            keep_orientation=True,
            noise=opt.noise,
            sigma=opt.sigma,
            data_size=opt.datasize,
            save=opt.save,
            transform=transform,
            normal=normal_imgs,
            target_transform=transforms.Compose([
                transforms.Resize(opt.imagesize//8),
            ]),
        ),
        batch_size=batch_size,
        shuffle=True,
        num_workers=opt.num_workers,
        pin_memory=True)

print("batch size is: ", batch_size)

if trainingdata is not None:
    print('training data: {} batches'.format(len(trainingdata)))
if testingdata is not None:
    print("testing data: {} batches".format(len(testingdata)))

print('load models')
print("torch.cuda.device_count(): ", torch.cuda.device_count())
print('type opt.gpuids: {}'.format(type(opt.gpuids)))
print("gpuids are: {}".format(opt.gpuids))

# net = DopeNetwork(pretrained=opt.model_arch_pretrained).cuda()
net = DopeNetwork(pretrained=opt.model_arch_pretrained).cuda(local_rank)
# net = torch.nn.DataParallel(net, device_ids=opt.gpuids).cuda()
# output_device defaults to device_ids[0], so passing it is explicit but redundant.
net = torch.nn.parallel.DistributedDataParallel(net, device_ids=[local_rank], output_device=local_rank)

# Optionally resume training from an existing checkpoint.
if opt.net != '':
    net.load_state_dict(torch.load(opt.net))

# Only optimize parameters that require gradients.
parameters = filter(lambda p: p.requires_grad, net.parameters())
optimizer = optim.Adam(parameters, lr=opt.learning_rate)
# scheduler = CosineAnnealingLR(optimizer, T_max=opt.num_epochs)  # , eta_min=1e-5 # not using lr scheduler for now
# optimizer = optim.SGD(parameters, lr=opt.learning_rate, momentum=opt.momentum, nesterov=True)

if self_is_main_node:
    with open(opt.checkpoints + '/loss_train.csv', 'w') as file:
        file.write('epoch,batchid,loss\n')
    with open(opt.checkpoints + '/loss_test.csv', 'w') as file:
        file.write('epoch,batchid,loss\n')
def _runnetwork(epoch, loader, train=True): | |
global nb_update_network | |
# net | |
if train: | |
net.train() | |
else: | |
net.eval() | |
for batch_idx, targets in enumerate(loader): | |
data = Variable(targets['img'].cuda()) | |
output_belief, output_affinities = net(data) | |
if train: | |
optimizer.zero_grad() | |
target_belief = Variable(targets['beliefs'].cuda()) | |
target_affinity = Variable(targets['affinities'].cuda()) | |
loss = None | |
# Belief maps loss | |
for l in output_belief: #output, each belief map layers. | |
if loss is None: | |
loss = ((l - target_belief) * (l-target_belief)).mean() | |
else: | |
loss_tmp = ((l - target_belief) * (l-target_belief)).mean() | |
loss += loss_tmp | |
# Affinities loss | |
for l in output_affinities: #output, each belief map layers. | |
loss_tmp = ((l - target_affinity) * (l-target_affinity)).mean() | |
loss += loss_tmp | |
if train: | |
loss.backward() | |
optimizer.step() | |
nb_update_network+=1 | |
if train: | |
namefile = '/loss_train.csv' | |
else: | |
namefile = '/loss_test.csv' | |
if self_is_main_node: | |
with open (opt.checkpoints + namefile,'a') as file: | |
s = '{}, {},{:.15f}\n'.format( | |
epoch,batch_idx,loss.data.item()) | |
file.write(s) | |
if train: | |
if batch_idx % opt.loginterval == 0: | |
print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.15f}'.format( | |
epoch, batch_idx * len(data), len(loader.dataset), | |
100. * batch_idx / len(loader), loss.data.item())) | |
print("epoch is: {} and train loss is {}".format(epoch, loss)) | |
if self_is_main_node: | |
mlflow.log_metric("train_loss", loss.data.item()) | |
else: | |
if batch_idx % opt.loginterval == 0: | |
print('Test Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.15f}'.format( | |
epoch, batch_idx * len(data), len(loader.dataset), | |
100. * batch_idx / len(loader), loss.data.item())) | |
print("epoch is: {} and test loss is {}".format(epoch, loss)) | |
if self_is_main_node: | |
mlflow.log_metric("test_loss", loss.data.item()) | |
# break | |
if not opt.nbupdates is None and nb_update_network > int(opt.nbupdates): | |
if self_is_main_node and opt.checkpoints is not None: | |
torch.save(net.state_dict(), '{}/net_{}.pth'.format(opt.checkpoints, opt.namefile)) | |
break | |
dist.barrier() | |
for epoch in range(1, opt.num_epochs + 1):

    if trainingdata is not None:
        # DistributedSampler reshuffles across epochs only when told the epoch
        # number; without set_epoch every epoch would see the same order.
        if train_sampler is not None:
            train_sampler.set_epoch(epoch)
        _runnetwork(epoch, trainingdata)
        # scheduler.step()  # not using lr scheduler for now

    if not opt.datatest == "":
        _runnetwork(epoch, testingdata, train=False)
        if opt.data == "":
            break  # lets get out of this if we are only testing

    if self_is_main_node and opt.checkpoints is not None:
        try:
            torch.save(net.state_dict(), '{}/net_{}_{}_{}.pth'.format(opt.checkpoints, opt.object, opt.namefile, epoch))
        except Exception:
            pass

    if opt.nbupdates is not None and nb_update_network > int(opt.nbupdates):
        break

print("end:", datetime.datetime.now().time())

if self_is_main_node:
    # Check for an active run before touching it, so a missing run does not crash.
    if mlflow.active_run():
        print(mlflow.active_run().info.run_uuid)
        mlflow.end_run()

def cleanup():
    dist.destroy_process_group()

cleanup()
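# Checkpointing note (added for clarity): the main process writes weights as
# {checkpoints}/net_{object}_{namefile}_{epoch}.pth after every epoch; to
# resume from one of them, pass it back in with --net, e.g. (hypothetical
# path, not from the original gist):
#
#   torchrun --nproc_per_node=8 train.py --data path/to/FAT --object soup \
#       --checkpoints soup --net train_soup/net_soup_epoch_10.pth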