remv(col_blocks);
+ memset(&remv[0], 0, sizeof(unsigned long long) * col_blocks);
+
+ int num_to_keep = 0;
+ for (int i = 0; i < boxes_num; i++) {
+ int nblock = i / threadsPerBlock;
+ int inblock = i % threadsPerBlock;
+
+ if (!(remv[nblock] & (1ULL << inblock))) {
+ keep_out[num_to_keep++] = i;
+ unsigned long long *p = &mask_host[0] + i * col_blocks;
+ for (int j = nblock; j < col_blocks; j++) {
+ remv[j] |= p[j];
+ }
+ }
+ }
+ *num_out = num_to_keep;
+
+ CUDA_CHECK(cudaFree(boxes_dev));
+ CUDA_CHECK(cudaFree(mask_dev));
+}
diff --git a/LAM_gpro/external/landmark_detection/FaceBoxesV2/utils/nms/py_cpu_nms.py b/LAM_gpro/external/landmark_detection/FaceBoxesV2/utils/nms/py_cpu_nms.py
new file mode 100644
index 0000000..54e7b25
--- /dev/null
+++ b/LAM_gpro/external/landmark_detection/FaceBoxesV2/utils/nms/py_cpu_nms.py
@@ -0,0 +1,38 @@
+# --------------------------------------------------------
+# Fast R-CNN
+# Copyright (c) 2015 Microsoft
+# Licensed under The MIT License [see LICENSE for details]
+# Written by Ross Girshick
+# --------------------------------------------------------
+
+import numpy as np
+
+def py_cpu_nms(dets, thresh):
+ """Pure Python NMS baseline."""
+ x1 = dets[:, 0]
+ y1 = dets[:, 1]
+ x2 = dets[:, 2]
+ y2 = dets[:, 3]
+ scores = dets[:, 4]
+
+ areas = (x2 - x1 + 1) * (y2 - y1 + 1)
+ order = scores.argsort()[::-1]
+
+ keep = []
+ while order.size > 0:
+ i = order[0]
+ keep.append(i)
+ xx1 = np.maximum(x1[i], x1[order[1:]])
+ yy1 = np.maximum(y1[i], y1[order[1:]])
+ xx2 = np.minimum(x2[i], x2[order[1:]])
+ yy2 = np.minimum(y2[i], y2[order[1:]])
+
+ w = np.maximum(0.0, xx2 - xx1 + 1)
+ h = np.maximum(0.0, yy2 - yy1 + 1)
+ inter = w * h
+ ovr = inter / (areas[i] + areas[order[1:]] - inter)
+
+ inds = np.where(ovr <= thresh)[0]
+ order = order[inds + 1]
+
+ return keep
diff --git a/LAM_gpro/external/landmark_detection/FaceBoxesV2/utils/nms_wrapper.py b/LAM_gpro/external/landmark_detection/FaceBoxesV2/utils/nms_wrapper.py
new file mode 100644
index 0000000..d529875
--- /dev/null
+++ b/LAM_gpro/external/landmark_detection/FaceBoxesV2/utils/nms_wrapper.py
@@ -0,0 +1,15 @@
+# --------------------------------------------------------
+# Fast R-CNN
+# Copyright (c) 2015 Microsoft
+# Licensed under The MIT License [see LICENSE for details]
+# Written by Ross Girshick
+# --------------------------------------------------------
+
+from .nms.cpu_nms import cpu_nms, cpu_soft_nms
+
+def nms(dets, thresh):
+ """Dispatch to the CPU NMS implementation (GPU path not wired up here)."""
+
+ if dets.shape[0] == 0:
+ return []
+ return cpu_nms(dets, thresh)
diff --git a/LAM_gpro/external/landmark_detection/FaceBoxesV2/utils/prior_box.py b/LAM_gpro/external/landmark_detection/FaceBoxesV2/utils/prior_box.py
new file mode 100644
index 0000000..e553667
--- /dev/null
+++ b/LAM_gpro/external/landmark_detection/FaceBoxesV2/utils/prior_box.py
@@ -0,0 +1,43 @@
+import torch
+from itertools import product as product
+import numpy as np
+from math import ceil
+
+
+class PriorBox(object):
+ def __init__(self, cfg, image_size=None, phase='train'):
+ super(PriorBox, self).__init__()
+ #self.aspect_ratios = cfg['aspect_ratios']
+ self.min_sizes = cfg['min_sizes']
+ self.steps = cfg['steps']
+ self.clip = cfg['clip']
+ self.image_size = image_size
+ self.feature_maps = [[ceil(self.image_size[0]/step), ceil(self.image_size[1]/step)] for step in self.steps]
+
+ def forward(self):
+ anchors = []
+ for k, f in enumerate(self.feature_maps):
+ min_sizes = self.min_sizes[k]
+ for i, j in product(range(f[0]), range(f[1])):
+ for min_size in min_sizes:
+ s_kx = min_size / self.image_size[1]
+ s_ky = min_size / self.image_size[0]
+ if min_size == 32:
+ dense_cx = [x*self.steps[k]/self.image_size[1] for x in [j+0, j+0.25, j+0.5, j+0.75]]
+ dense_cy = [y*self.steps[k]/self.image_size[0] for y in [i+0, i+0.25, i+0.5, i+0.75]]
+ for cy, cx in product(dense_cy, dense_cx):
+ anchors += [cx, cy, s_kx, s_ky]
+ elif min_size == 64:
+ dense_cx = [x*self.steps[k]/self.image_size[1] for x in [j+0, j+0.5]]
+ dense_cy = [y*self.steps[k]/self.image_size[0] for y in [i+0, i+0.5]]
+ for cy, cx in product(dense_cy, dense_cx):
+ anchors += [cx, cy, s_kx, s_ky]
+ else:
+ cx = (j + 0.5) * self.steps[k] / self.image_size[1]
+ cy = (i + 0.5) * self.steps[k] / self.image_size[0]
+ anchors += [cx, cy, s_kx, s_ky]
+ # back to torch land
+ output = torch.Tensor(anchors).view(-1, 4)
+ if self.clip:
+ output.clamp_(max=1, min=0)
+ return output
diff --git a/LAM_gpro/external/landmark_detection/FaceBoxesV2/utils/timer.py b/LAM_gpro/external/landmark_detection/FaceBoxesV2/utils/timer.py
new file mode 100644
index 0000000..e4b3b80
--- /dev/null
+++ b/LAM_gpro/external/landmark_detection/FaceBoxesV2/utils/timer.py
@@ -0,0 +1,40 @@
+# --------------------------------------------------------
+# Fast R-CNN
+# Copyright (c) 2015 Microsoft
+# Licensed under The MIT License [see LICENSE for details]
+# Written by Ross Girshick
+# --------------------------------------------------------
+
+import time
+
+
+class Timer(object):
+ """A simple timer."""
+ def __init__(self):
+ self.total_time = 0.
+ self.calls = 0
+ self.start_time = 0.
+ self.diff = 0.
+ self.average_time = 0.
+
+ def tic(self):
+ # using time.time instead of time.clock because time.clock
+ # does not normalize for multithreading
+ self.start_time = time.time()
+
+ def toc(self, average=True):
+ self.diff = time.time() - self.start_time
+ self.total_time += self.diff
+ self.calls += 1
+ self.average_time = self.total_time / self.calls
+ if average:
+ return self.average_time
+ else:
+ return self.diff
+
+ def clear(self):
+ self.total_time = 0.
+ self.calls = 0
+ self.start_time = 0.
+ self.diff = 0.
+ self.average_time = 0.
diff --git a/LAM_gpro/external/landmark_detection/README.md b/LAM_gpro/external/landmark_detection/README.md
new file mode 100644
index 0000000..68abf0f
--- /dev/null
+++ b/LAM_gpro/external/landmark_detection/README.md
@@ -0,0 +1,110 @@
+# STAR Loss: Reducing Semantic Ambiguity in Facial Landmark Detection.
+
+Paper Link: [arxiv](https://arxiv.org/abs/2306.02763) | [CVPR 2023](https://openaccess.thecvf.com/content/CVPR2023/papers/Zhou_STAR_Loss_Reducing_Semantic_Ambiguity_in_Facial_Landmark_Detection_CVPR_2023_paper.pdf)
+
+
+- Pytorch implementation of **S**elf-adap**T**ive **A**mbiguity **R**eduction (**STAR**) loss.
+- STAR loss is a self-adaptive anisotropic direction loss, which can be used in heatmap regression-based methods for facial landmark detection.
+- Specifically, we find that semantic ambiguity results in the anisotropic predicted distribution, which inspires us to use predicted distribution to represent semantic ambiguity. So, we use PCA to indicate the character of the predicted distribution and indirectly formulate the direction and intensity of semantic ambiguity. Based on this, STAR loss adaptively suppresses the prediction error in the ambiguity direction to mitigate the impact of ambiguity annotation in training. More details can be found in our paper.
+
+
+
+
+
+
+
+## Dependencies
+
+* python==3.7.3
+* PyTorch=1.6.0
+* requirements.txt
+
+## Dataset Preparation
+
+ - Step1: Download the raw images from [COFW](http://www.vision.caltech.edu/xpburgos/ICCV13/#dataset), [300W](https://ibug.doc.ic.ac.uk/resources/300-W/), and [WFLW](https://wywu.github.io/projects/LAB/WFLW.html).
+ - Step2: We follow the data preprocessing in [ADNet](https://openaccess.thecvf.com/content/ICCV2021/papers/Huang_ADNet_Leveraging_Error-Bias_Towards_Normal_Direction_in_Face_Alignment_ICCV_2021_paper.pdf), and the metadata can be downloaded from [the corresponding repository](https://github.com/huangyangyu/ADNet).
+ - Step3: Make them look like this:
+```script
+# the dataset directory:
+|-- ${image_dir}
+ |-- WFLW
+ | -- WFLW_images
+ |-- 300W
+ | -- afw
+ | -- helen
+ | -- ibug
+ | -- lfpw
+ |-- COFW
+ | -- train
+ | -- test
+|-- ${annot_dir}
+ |-- WFLW
+ |-- train.tsv, test.tsv
+ |-- 300W
+ |-- train.tsv, test.tsv
+ |--COFW
+ |-- train.tsv, test.tsv
+```
+
+## Usage
+* Work directory: set the ${ckpt_dir} in ./conf/alignment.py.
+* Pretrained model:
+
+| Dataset | Model |
+|:-----------------------------------------------------------------|:--------------------------------------------------------------------------------------------------------------------------------------------------------------------|
+| WFLW | [google](https://drive.google.com/file/d/1aOx0wYEZUfBndYy_8IYszLPG_D2fhxrT/view?usp=sharing) / [baidu](https://pan.baidu.com/s/10vvI-ovs3x9NrdmpnXK6sg?pwd=u0yu) |
+| 300W | [google](https://drive.google.com/file/d/1Fiu3hjjkQRdKsWE9IgyNPdiJSz9_MzA5/view?usp=sharing) / [baidu](https://pan.baidu.com/s/1bjUhLq1zS1XSl1nX78fU7A?pwd=yb2s) |
+| COFW | [google](https://drive.google.com/file/d/1NFcZ9jzql_jnn3ulaSzUlyhS05HWB9n_/view?usp=drive_link) / [baidu](https://pan.baidu.com/s/1XO6hDZ8siJLTgFcpyu1Tzw?pwd=m57n) |
+
+
+### Training
+```shell
+python main.py --mode=train --device_ids=0,1,2,3 \
+ --image_dir=${image_dir} --annot_dir=${annot_dir} \
+ --data_definition={WFLW, 300W, COFW}
+```
+
+### Testing
+```shell
+python main.py --mode=test --device_ids=0 \
+ --image_dir=${image_dir} --annot_dir=${annot_dir} \
+ --data_definition={WFLW, 300W, COFW} \
+ --pretrained_weight=${model_path} \
+```
+
+### Evaluation
+```shell
+python evaluate.py --device_ids=0 \
+ --model_path=${model_path} --metadata_path=${metadata_path} \
+ --image_dir=${image_dir} --data_definition={WFLW, 300W, COFW} \
+```
+
+To test on your own image, the following code could be considered:
+```shell
+python demo.py
+```
+
+
+## Results
+The models trained with STAR Loss achieved **SOTA** performance on all of the COFW, 300W, and WFLW datasets.
+
+
+
+
+
+## BibTeX Citation
+Please consider citing our papers in your publications if the project helps your research. BibTeX reference is as follows.
+```
+@inproceedings{Zhou_2023_CVPR,
+ author = {Zhou, Zhenglin and Li, Huaxia and Liu, Hong and Wang, Nanyang and Yu, Gang and Ji, Rongrong},
+ title = {STAR Loss: Reducing Semantic Ambiguity in Facial Landmark Detection},
+ booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)},
+ month = {June},
+ year = {2023},
+ pages = {15475-15484}
+}
+```
+
+## Acknowledgments
+This repository is built on top of [ADNet](https://github.com/huangyangyu/ADNet).
+Thanks for this strong baseline.
diff --git a/LAM_gpro/external/landmark_detection/conf/__init__.py b/LAM_gpro/external/landmark_detection/conf/__init__.py
new file mode 100644
index 0000000..2f92d0e
--- /dev/null
+++ b/LAM_gpro/external/landmark_detection/conf/__init__.py
@@ -0,0 +1 @@
+from .alignment import Alignment
\ No newline at end of file
diff --git a/LAM_gpro/external/landmark_detection/conf/alignment.py b/LAM_gpro/external/landmark_detection/conf/alignment.py
new file mode 100644
index 0000000..ac58e1d
--- /dev/null
+++ b/LAM_gpro/external/landmark_detection/conf/alignment.py
@@ -0,0 +1,239 @@
+import os.path as osp
+from .base import Base
+
+
+class Alignment(Base):
+ """
+ Alignment configuration, which contains the training parameters for alignment.
+ """
+
+ def __init__(self, args):
+ super(Alignment, self).__init__('alignment')
+ self.ckpt_dir = '/mnt/workspace/humanAIGC/project/STAR/weights'
+ self.net = "stackedHGnet_v1"
+ self.nstack = 4
+ self.loader_type = "alignment"
+ self.data_definition = "300W" # COFW, 300W, WFLW
+ self.test_file = "test.tsv"
+
+ # image
+ self.channels = 3
+ self.width = 256
+ self.height = 256
+ self.means = (127.5, 127.5, 127.5)
+ self.scale = 1 / 127.5
+ self.aug_prob = 1.0
+
+ self.display_iteration = 10
+ self.val_epoch = 1
+ self.valset = "test.tsv"
+ self.norm_type = 'default'
+ self.encoder_type = 'default'
+ self.decoder_type = 'default'
+
+ # scheduler & optimizer
+ self.milestones = [200, 350, 450]
+ self.max_epoch = 260
+ self.optimizer = "adam"
+ self.learn_rate = 0.001
+ self.weight_decay = 0.00001
+ self.betas = [0.9, 0.999]
+ self.gamma = 0.1
+
+ # batch_size & workers
+ self.batch_size = 32
+ self.train_num_workers = 16
+ self.val_batch_size = 32
+ self.val_num_workers = 16
+ self.test_batch_size = 16
+ self.test_num_workers = 0
+
+ # tricks
+ self.ema = True
+ self.add_coord = True
+ self.use_AAM = True
+
+ # loss
+ self.loss_func = "STARLoss_v2"
+
+ # STAR Loss paras
+ self.star_w = 1
+ self.star_dist = 'smoothl1'
+
+ self.init_from_args(args)
+
+ # COFW
+ if self.data_definition == "COFW":
+ self.edge_info = (
+ (True, (0, 4, 2, 5)), # RightEyebrow
+ (True, (1, 6, 3, 7)), # LeftEyebrow
+ (True, (8, 12, 10, 13)), # RightEye
+ (False, (9, 14, 11, 15)), # LeftEye
+ (True, (18, 20, 19, 21)), # Nose
+ (True, (22, 26, 23, 27)), # LowerLip
+ (True, (22, 24, 23, 25)), # UpperLip
+ )
+ if self.norm_type == 'ocular':
+ self.nme_left_index = 8 # ocular
+ self.nme_right_index = 9 # ocular
+ elif self.norm_type in ['pupil', 'default']:
+ self.nme_left_index = 16 # pupil
+ self.nme_right_index = 17 # pupil
+ else:
+ raise NotImplementedError
+ self.classes_num = [29, 7, 29]
+ self.crop_op = True
+ self.flip_mapping = (
+ [0, 1], [4, 6], [2, 3], [5, 7], [8, 9], [10, 11], [12, 14], [16, 17], [13, 15], [18, 19], [22, 23],
+ )
+ self.image_dir = osp.join(self.image_dir, 'COFW')
+ # 300W
+ elif self.data_definition == "300W":
+ self.edge_info = (
+ (False, (0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16)), # FaceContour
+ (False, (17, 18, 19, 20, 21)), # RightEyebrow
+ (False, (22, 23, 24, 25, 26)), # LeftEyebrow
+ (False, (27, 28, 29, 30)), # NoseLine
+ (False, (31, 32, 33, 34, 35)), # Nose
+ (True, (36, 37, 38, 39, 40, 41)), # RightEye
+ (True, (42, 43, 44, 45, 46, 47)), # LeftEye
+ (True, (48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59)), # OuterLip
+ (True, (60, 61, 62, 63, 64, 65, 66, 67)), # InnerLip
+ )
+ if self.norm_type in ['ocular', 'default']:
+ self.nme_left_index = 36 # ocular
+ self.nme_right_index = 45 # ocular
+ elif self.norm_type == 'pupil':
+ self.nme_left_index = [36, 37, 38, 39, 40, 41] # pupil
+ self.nme_right_index = [42, 43, 44, 45, 46, 47] # pupil
+ else:
+ raise NotImplementedError
+ self.classes_num = [68, 9, 68]
+ self.crop_op = True
+ self.flip_mapping = (
+ [0, 16], [1, 15], [2, 14], [3, 13], [4, 12], [5, 11], [6, 10], [7, 9],
+ [17, 26], [18, 25], [19, 24], [20, 23], [21, 22],
+ [31, 35], [32, 34],
+ [36, 45], [37, 44], [38, 43], [39, 42], [40, 47], [41, 46],
+ [48, 54], [49, 53], [50, 52], [61, 63], [60, 64], [67, 65], [58, 56], [59, 55],
+ )
+ self.image_dir = osp.join(self.image_dir, '300W')
+ # self.image_dir = osp.join(self.image_dir, '300VW_images')
+ # 300VW
+ elif self.data_definition == "300VW":
+ self.edge_info = (
+ (False, (0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16)), # FaceContour
+ (False, (17, 18, 19, 20, 21)), # RightEyebrow
+ (False, (22, 23, 24, 25, 26)), # LeftEyebrow
+ (False, (27, 28, 29, 30)), # NoseLine
+ (False, (31, 32, 33, 34, 35)), # Nose
+ (True, (36, 37, 38, 39, 40, 41)), # RightEye
+ (True, (42, 43, 44, 45, 46, 47)), # LeftEye
+ (True, (48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59)), # OuterLip
+ (True, (60, 61, 62, 63, 64, 65, 66, 67)), # InnerLip
+ )
+ if self.norm_type in ['ocular', 'default']:
+ self.nme_left_index = 36 # ocular
+ self.nme_right_index = 45 # ocular
+ elif self.norm_type == 'pupil':
+ self.nme_left_index = [36, 37, 38, 39, 40, 41] # pupil
+ self.nme_right_index = [42, 43, 44, 45, 46, 47] # pupil
+ else:
+ raise NotImplementedError
+ self.classes_num = [68, 9, 68]
+ self.crop_op = True
+ self.flip_mapping = (
+ [0, 16], [1, 15], [2, 14], [3, 13], [4, 12], [5, 11], [6, 10], [7, 9],
+ [17, 26], [18, 25], [19, 24], [20, 23], [21, 22],
+ [31, 35], [32, 34],
+ [36, 45], [37, 44], [38, 43], [39, 42], [40, 47], [41, 46],
+ [48, 54], [49, 53], [50, 52], [61, 63], [60, 64], [67, 65], [58, 56], [59, 55],
+ )
+ self.image_dir = osp.join(self.image_dir, '300VW_Dataset_2015_12_14')
+ # WFLW
+ elif self.data_definition == "WFLW":
+ self.edge_info = (
+ (False, (
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26,
+ 27,
+ 28, 29, 30, 31, 32)), # FaceContour
+ (True, (33, 34, 35, 36, 37, 38, 39, 40, 41)), # RightEyebrow
+ (True, (42, 43, 44, 45, 46, 47, 48, 49, 50)), # LeftEyebrow
+ (False, (51, 52, 53, 54)), # NoseLine
+ (False, (55, 56, 57, 58, 59)), # Nose
+ (True, (60, 61, 62, 63, 64, 65, 66, 67)), # RightEye
+ (True, (68, 69, 70, 71, 72, 73, 74, 75)), # LeftEye
+ (True, (76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87)), # OuterLip
+ (True, (88, 89, 90, 91, 92, 93, 94, 95)), # InnerLip
+ )
+ if self.norm_type in ['ocular', 'default']:
+ self.nme_left_index = 60 # ocular
+ self.nme_right_index = 72 # ocular
+ elif self.norm_type == 'pupil':
+ self.nme_left_index = 96 # pupils
+ self.nme_right_index = 97 # pupils
+ else:
+ raise NotImplementedError
+ self.classes_num = [98, 9, 98]
+ self.crop_op = True
+ self.flip_mapping = (
+ [0, 32], [1, 31], [2, 30], [3, 29], [4, 28], [5, 27], [6, 26], [7, 25], [8, 24], [9, 23], [10, 22],
+ [11, 21], [12, 20], [13, 19], [14, 18], [15, 17], # cheek
+ [33, 46], [34, 45], [35, 44], [36, 43], [37, 42], [38, 50], [39, 49], [40, 48], [41, 47], # eyebrow
+ [60, 72], [61, 71], [62, 70], [63, 69], [64, 68], [65, 75], [66, 74], [67, 73],
+ [55, 59], [56, 58],
+ [76, 82], [77, 81], [78, 80], [87, 83], [86, 84],
+ [88, 92], [89, 91], [95, 93], [96, 97]
+ )
+ self.image_dir = osp.join(self.image_dir, 'WFLW', 'WFLW_images')
+
+ self.label_num = self.nstack * 3 if self.use_AAM else self.nstack
+ self.loss_weights, self.criterions, self.metrics = [], [], []
+ for i in range(self.nstack):
+ factor = (2 ** i) / (2 ** (self.nstack - 1))
+ if self.use_AAM:
+ self.loss_weights += [factor * weight for weight in [1.0, 10.0, 10.0]]
+ self.criterions += [self.loss_func, "AWingLoss", "AWingLoss"]
+ self.metrics += ["NME", None, None]
+ else:
+ self.loss_weights += [factor * weight for weight in [1.0]]
+ self.criterions += [self.loss_func, ]
+ self.metrics += ["NME", ]
+
+ self.key_metric_index = (self.nstack - 1) * 3 if self.use_AAM else (self.nstack - 1)
+
+ # data
+ self.folder = self.get_foldername()
+ self.work_dir = osp.join(self.ckpt_dir, self.data_definition, self.folder)
+ self.model_dir = osp.join(self.work_dir, 'model')
+ self.log_dir = osp.join(self.work_dir, 'log')
+
+ self.train_tsv_file = osp.join(self.annot_dir, self.data_definition, "train.tsv")
+ self.train_pic_dir = self.image_dir
+
+ self.val_tsv_file = osp.join(self.annot_dir, self.data_definition, self.valset)
+ self.val_pic_dir = self.image_dir
+
+ self.test_tsv_file = osp.join(self.annot_dir, self.data_definition, self.test_file)
+ self.test_pic_dir = self.image_dir
+
+ # self.train_tsv_file = osp.join(self.annot_dir, '300VW', "train.tsv")
+ # self.train_pic_dir = self.image_dir
+
+ # self.val_tsv_file = osp.join(self.annot_dir, '300VW', self.valset)
+ # self.val_pic_dir = self.image_dir
+
+ # self.test_tsv_file = osp.join(self.annot_dir, '300VW', self.test_file)
+ # self.test_pic_dir = self.image_dir
+
+
+ def get_foldername(self):
+ str = ''
+ str += '{}_{}x{}_{}_ep{}_lr{}_bs{}'.format(self.data_definition, self.height, self.width,
+ self.optimizer, self.max_epoch, self.learn_rate, self.batch_size)
+ str += '_{}'.format(self.loss_func)
+ str += '_{}_{}'.format(self.star_dist, self.star_w) if self.loss_func == 'STARLoss' else ''
+ str += '_AAM' if self.use_AAM else ''
+ str += '_{}'.format(self.valset[:-4]) if self.valset != 'test.tsv' else ''
+ str += '_{}'.format(self.id)
+ return str
diff --git a/LAM_gpro/external/landmark_detection/conf/base.py b/LAM_gpro/external/landmark_detection/conf/base.py
new file mode 100644
index 0000000..bd4885c
--- /dev/null
+++ b/LAM_gpro/external/landmark_detection/conf/base.py
@@ -0,0 +1,94 @@
+import uuid
+import logging
+import os.path as osp
+from argparse import Namespace
+# from tensorboardX import SummaryWriter
+
+class Base:
+ """
+ Base configuration, which contains the basic training parameters and should be inherited by task-specific configuration classes.
+ """
+
+ def __init__(self, config_name, ckpt_dir='./', image_dir='./', annot_dir='./'):
+ self.type = config_name
+ self.id = str(uuid.uuid4())
+ self.note = ""
+
+ self.ckpt_dir = ckpt_dir
+ self.image_dir = image_dir
+ self.annot_dir = annot_dir
+
+ self.loader_type = "alignment"
+ self.loss_func = "STARLoss"
+
+ # train
+ self.batch_size = 128
+ self.val_batch_size = 1
+ self.test_batch_size = 32
+ self.channels = 3
+ self.width = 256
+ self.height = 256
+
+ # mean values in r, g, b channel.
+ self.means = (127, 127, 127)
+ self.scale = 0.0078125
+
+ self.display_iteration = 100
+ self.milestones = [50, 80]
+ self.max_epoch = 100
+
+ self.net = "stackedHGnet_v1"
+ self.nstack = 4
+
+ # ["adam", "sgd"]
+ self.optimizer = "adam"
+ self.learn_rate = 0.1
+ self.momentum = 0.01 # caffe: 0.99
+ self.weight_decay = 0.0
+ self.nesterov = False
+ self.scheduler = "MultiStepLR"
+ self.gamma = 0.1
+
+ self.loss_weights = [1.0]
+ self.criterions = ["SoftmaxWithLoss"]
+ self.metrics = ["Accuracy"]
+ self.key_metric_index = 0
+ self.classes_num = [1000]
+ self.label_num = len(self.classes_num)
+
+ # model
+ self.ema = False
+ self.use_AAM = True
+
+ # visualization
+ self.writer = None
+
+ # log file
+ self.logger = None
+
+ def init_instance(self):
+ # self.writer = SummaryWriter(logdir=self.log_dir, comment=self.type)
+ log_formatter = logging.Formatter("%(asctime)s %(levelname)-8s: %(message)s")
+ root_logger = logging.getLogger()
+ file_handler = logging.FileHandler(osp.join(self.log_dir, "log.txt"))
+ file_handler.setFormatter(log_formatter)
+ file_handler.setLevel(logging.NOTSET)
+ root_logger.addHandler(file_handler)
+ console_handler = logging.StreamHandler()
+ console_handler.setFormatter(log_formatter)
+ console_handler.setLevel(logging.NOTSET)
+ root_logger.addHandler(console_handler)
+ root_logger.setLevel(logging.NOTSET)
+ self.logger = root_logger
+
+ def __del__(self):
+ # tensorboard --logdir self.log_dir
+ if self.writer is not None:
+ # self.writer.export_scalars_to_json(self.log_dir + "visual.json")
+ self.writer.close()
+
+ def init_from_args(self, args: Namespace):
+ args_vars = vars(args)
+ for key, value in args_vars.items():
+ if hasattr(self, key) and value is not None:
+ setattr(self, key, value)
diff --git a/LAM_gpro/external/landmark_detection/config.json b/LAM_gpro/external/landmark_detection/config.json
new file mode 100644
index 0000000..35831f0
--- /dev/null
+++ b/LAM_gpro/external/landmark_detection/config.json
@@ -0,0 +1,15 @@
+{
+ "Token":"bpt4JPotFA6bpdknR9ZDCw",
+ "business_flag": "shadow_cv_face",
+ "model_local_file_path": "/apdcephfs_cq3/share_1134483/charlinzhou/Documents/awesome-tools/jizhi/",
+ "host_num": 1,
+ "host_gpu_num": 1,
+ "GPUName": "V100",
+ "is_elasticity": true,
+ "enable_evicted_pulled_up": true,
+ "task_name": "20230312_slpt_star_bb_init_eigen_box_align_smoothl1-1",
+ "task_flag": "20230312_slpt_star_bb_init_eigen_box_align_smoothl1-1",
+ "model_name": "20230312_slpt_star_bb_init_eigen_box_align_smoothl1-1",
+ "image_full_name": "mirrors.tencent.com/haroldzcli/py36-pytorch1.7.1-torchvision0.8.2-cuda10.1-cudnn7.6",
+ "start_cmd": "./start_slpt.sh /apdcephfs_cq3/share_1134483/charlinzhou/Documents/SLPT_Training train.py --loss_func=star --bb_init --eigen_box --dist_func=align_smoothl1"
+}
diff --git a/LAM_gpro/external/landmark_detection/data_processor/CheckFaceKeyPoint.py b/LAM_gpro/external/landmark_detection/data_processor/CheckFaceKeyPoint.py
new file mode 100644
index 0000000..d15d8f3
--- /dev/null
+++ b/LAM_gpro/external/landmark_detection/data_processor/CheckFaceKeyPoint.py
@@ -0,0 +1,147 @@
+import os
+
+import cv2
+import numpy as np
+from PIL import Image
+
+selected_indices_old = [
+ 2311,
+ 2416,
+ 2437,
+ 2460,
+ 2495,
+ 2518,
+ 2520,
+ 2627,
+ 4285,
+ 4315,
+ 6223,
+ 6457,
+ 6597,
+ 6642,
+ 6974,
+ 7054,
+ 7064,
+ 7182,
+ 7303,
+ 7334,
+ 7351,
+ 7368,
+ 7374,
+ 7493,
+ 7503,
+ 7626,
+ 8443,
+ 8562,
+ 8597,
+ 8701,
+ 8817,
+ 8953,
+ 11213,
+ 11261,
+ 11317,
+ 11384,
+ 11600,
+ 11755,
+ 11852,
+ 11891,
+ 11945,
+ 12010,
+ 12354,
+ 12534,
+ 12736,
+ 12880,
+ 12892,
+ 13004,
+ 13323,
+ 13371,
+ 13534,
+ 13575,
+ 14874,
+ 14949,
+ 14977,
+ 15052,
+ 15076,
+ 15291,
+ 15620,
+ 15758,
+ 16309,
+ 16325,
+ 16348,
+ 16390,
+ 16489,
+ 16665,
+ 16891,
+ 17147,
+ 17183,
+ 17488,
+ 17549,
+ 17657,
+ 17932,
+ 19661,
+ 20162,
+ 20200,
+ 20238,
+ 20286,
+ 20432,
+ 20834,
+ 20954,
+ 21015,
+ 21036,
+ 21117,
+ 21299,
+ 21611,
+ 21632,
+ 21649,
+ 22722,
+ 22759,
+ 22873,
+ 23028,
+ 23033,
+ 23082,
+ 23187,
+ 23232,
+ 23302,
+ 23413,
+ 23430,
+ 23446,
+ 23457,
+ 23548,
+ 23636,
+ 32060,
+ 32245,
+]
+
+selected_indices = list()
+with open('/home/gyalex/Desktop/face_anno.txt', 'r') as f:
+ lines = f.readlines()
+ for line in lines:
+ hh = line.strip().split()
+ if len(hh) > 0:
+ pid = hh[0].find('.')
+ if pid != -1:
+ s = hh[0][pid+1:len(hh[0])]
+ print(s)
+ selected_indices.append(int(s))
+
+f.close()
+
+dir = '/media/gyalex/Data/face_ldk_dataset/MHC_LightingPreset_Portrait_RT_0_19/MHC_LightingPreset_Portrait_RT_seq_000015'
+
+for idx in range(500):
+ img = os.path.join(dir, "view_1/MHC_LightingPreset_Portrait_RT_seq_000015_FinalImage_" + str(idx).zfill(4) + ".jpeg")
+ lmd = os.path.join(dir, "mesh/mesh_screen" + str(idx+5).zfill(7) + ".npy")
+
+ img = cv2.imread(img)
+ # c = 511 / 2
+ # lmd = np.load(lmd) * c + c
+ # lmd[:, 1] = 511 - lmd[:, 1]
+ lmd = np.load(lmd)[selected_indices]
+ for i in range(lmd.shape[0]):
+ p = lmd[i]
+ x, y = round(float(p[0])), round(float(p[1]))
+ print(p)
+ cv2.circle(img, (x, y), 2, (0, 0, 255), -1)
+
+ cv2.imshow('win', img)
+ cv2.waitKey(0)
\ No newline at end of file
diff --git a/LAM_gpro/external/landmark_detection/data_processor/align.py b/LAM_gpro/external/landmark_detection/data_processor/align.py
new file mode 100644
index 0000000..be9920e
--- /dev/null
+++ b/LAM_gpro/external/landmark_detection/data_processor/align.py
@@ -0,0 +1,193 @@
+import numpy as np
+import open3d as o3d
+from scipy.spatial.transform import Rotation
+from scipy.linalg import orthogonal_procrustes
+
+from open3d.pipelines.registration import registration_ransac_based_on_correspondence
+
+
+def rigid_transform_3D(A, B):
+ assert A.shape == B.shape, "Input arrays must have the same shape"
+ assert A.shape[1] == 3, "Input arrays must be Nx3"
+
+ N = A.shape[0] # Number of points
+
+ # Compute centroids of A and B
+ centroid_A = np.mean(A, axis=0)
+ centroid_B = np.mean(B, axis=0)
+
+ # Center the points around the centroids
+ AA = A - centroid_A
+ BB = B - centroid_B
+
+ # H = AA^T * BB
+ H = np.dot(AA.T, BB)
+
+ # Singular Value Decomposition
+ U, S, Vt = np.linalg.svd(H)
+
+ # Compute rotation
+ R = np.dot(Vt.T, U.T)
+
+ # Ensure a proper rotation (det(R) should be +1)
+ if np.linalg.det(R) < 0:
+ Vt[2, :] *= -1
+ R = np.dot(Vt.T, U.T)
+
+ # Compute translation
+ t = centroid_B - np.dot(R, centroid_A)
+
+ # Construct the transform matrix (4x4)
+ transform_matrix = np.eye(4)
+ transform_matrix[:3, :3] = R
+ transform_matrix[:3, 3] = t
+
+ return transform_matrix
+
+
+def compute_rigid_transform(points1, points2):
+ """
+ 计算从points1到points2的刚体变换(包括尺度、旋转和平移)。
+
+ 参数:
+ points1, points2: np.ndarray, 形状为(68, 3)的数组,分别为两组3D对应点。
+
+ 返回:
+ scale: float, 尺度因子
+ R: np.ndarray, 3x3的旋转矩阵
+ t: np.ndarray, 3维的平移向量
+ """
+ # 中心化
+ mean1 = np.mean(points1, axis=0)
+ centered_points1 = points1 - mean1
+ mean2 = np.mean(points2, axis=0)
+ centered_points2 = points2 - mean2
+
+ # 使用orthogonal_procrustes计算旋转和平移
+ R, _ = orthogonal_procrustes(centered_points1, centered_points2)
+ t = mean2 - R @ mean1 # 计算平移向量
+
+ # 计算尺度因子
+ scale = np.mean(np.linalg.norm(centered_points2, axis=1) /
+ np.linalg.norm(centered_points1, axis=1))
+
+ return scale, R, t
+
+
+def compute_rigid_transform_new(points_A, points_B):
+ # 中心化
+ center_A = np.mean(points_A, axis=0)
+ center_B = np.mean(points_B, axis=0)
+ points_A_centered = points_A - center_A
+ points_B_centered = points_B - center_B
+
+ # 计算协方差矩阵
+ cov_matrix = np.dot(points_A_centered.T, points_B_centered)
+
+ # SVD分解
+ U, S, Vt = np.linalg.svd(cov_matrix)
+
+ # 确保旋转矩阵为正交且右手系,这里我们取Vt的转置作为旋转矩阵
+ rotation_matrix = np.dot(Vt.T, U.T)
+
+ # 检查行列式是否为-1(表示反射,不满足旋转矩阵要求),如果是,则调整一个列的符号
+ if np.linalg.det(rotation_matrix) < 0:
+ Vt[2,:] *= -1
+ rotation_matrix = np.dot(Vt.T, U.T)
+
+ # 计算尺度因子
+ scale = np.trace(np.dot(points_A_centered.T, points_B_centered)) / np.trace(np.dot(points_A_centered.T, points_A_centered))
+
+ # 计算平移向量
+ translation_vector = center_B - scale * np.dot(rotation_matrix, center_A)
+
+ return scale, rotation_matrix, translation_vector
+
+
+
+
+# 示范用法
+obj_A = '/home/gyalex/Desktop/our_face.obj'
+obj_B = '/home/gyalex/Desktop/Neutral.obj'
+
+mesh_A = o3d.io.read_triangle_mesh(obj_A)
+mesh_B = o3d.io.read_triangle_mesh(obj_B)
+
+vertices_A = np.asarray(mesh_A.vertices)
+vertices_B = np.asarray(mesh_B.vertices)
+
+list_A = list()
+list_B = list()
+with open('/home/gyalex/Desktop/our_marker.txt', 'r') as f:
+ lines_A = f.readlines()
+ for line in lines_A:
+ hh = line.strip().split()
+ list_A.append(int(hh[0]))
+
+with open('/home/gyalex/Desktop/ARKit_landmarks.txt', 'r') as f:
+ lines_B = f.readlines()
+ for line in lines_B:
+ hh = line.strip().split()
+ list_B.append(int(hh[0]))
+
+A = vertices_A[list_A,:] # 第一组3D点
+B = vertices_B[list_B,:] # 第二组3D点
+
+# scale, R, t = compute_rigid_transform(A, B)
+
+# # 定义尺度变换矩阵
+# scale_matrix = np.eye(4)
+# scale_matrix[0, 0] = scale # x轴方向放大2倍
+# scale_matrix[1, 1] = scale # y轴方向放大2倍
+# scale_matrix[2, 2] = scale # z轴方向放大2倍
+
+# transform_matrix = np.eye(4)
+# transform_matrix[:3, :3] = scale
+# transform_matrix[:3, 3] = R*t
+
+# mesh_A.transform(transform_matrix)
+# # mesh_A.transform(scale_matrix)
+
+# o3d.io.write_triangle_mesh('/home/gyalex/Desktop/our_face_new.obj', mesh_A)
+
+pcd_source = o3d.utility.Vector3dVector(A) # 示例源点云数据
+pcd_target = o3d.utility.Vector3dVector(B) # 示例目标点云数据 + 1偏移,仅作示例
+
+corres_source = list()
+for idx in range(68): corres_source.append(idx)
+corres_target = list()
+for idx in range(68): corres_target.append(idx)
+
+# 根据对应点索引获取实际的对应点坐标
+corres_source_points = pcd_source
+corres_target_points = pcd_target
+
+corres = o3d.utility.Vector2iVector([[src, tgt] for src, tgt in zip(corres_source, corres_target)])
+
+# 应用RANSAC进行基于对应点的配准
+reg_result = registration_ransac_based_on_correspondence(
+ pcd_source,
+ pcd_target,
+ corres,
+ estimation_method=o3d.pipelines.registration.TransformationEstimationPointToPoint(),
+ ransac_n=3,
+ criteria=o3d.pipelines.registration.RANSACConvergenceCriteria(max_iteration=100000, epsilon=1e-6)
+)
+
+# # 使用RANSAC进行配准
+# convergence_criteria = o3d.pipelines.registration.RANSACConvergenceCriteria(max_iteration=50000, max_validation=500)
+# ransac_result = o3d.pipelines.registration.registration_ransac_based_on_correspondence(
+# pcd_source,
+# pcd_target,
+# corres,
+# o3d.pipelines.registration.TransformationEstimationPointToPoint(),
+# 3, # RANSAC阈值,根据实际情况调整
+# convergence_criteria,
+# [o3d.pipelines.registration.CorrespondenceCheckerBasedOnEdgeLength(0.9),
+# o3d.pipelines.registration.CorrespondenceCheckerBasedOnDistance(0.05)],
+# o3d.pipelines.registration.RANSACLoss())
+
+# 应用变换到源mesh
+# mesh_source_aligned = mesh_source.transform(reg_result.transformation)
+
+a = 0
\ No newline at end of file
diff --git a/LAM_gpro/external/landmark_detection/data_processor/process_pcd.py b/LAM_gpro/external/landmark_detection/data_processor/process_pcd.py
new file mode 100644
index 0000000..e6183ab
--- /dev/null
+++ b/LAM_gpro/external/landmark_detection/data_processor/process_pcd.py
@@ -0,0 +1,250 @@
+import os
+import cv2
+import numpy as np
+import open3d as o3d
+# import pyrender
+# from pyrender import mesh, DirectionalLight, Material, PerspectiveCamera
+
+os.environ['__GL_THREADED_OPTIMIZATIONS'] = '1'
+
# Landmark pixel coordinates, one "x y" pair per line.
cord_list = []
with open('./cord.txt', 'r') as f:
    lines = f.readlines()
    for line in lines:
        m = line.split()
        x = int(m[0])
        y = int(m[1])

        # Mirror into the flipped 1000x1000 image frame — TODO confirm the canvas size.
        x = 1000 - x
        y = 1000 - y

        cord_list.append([x, y])


# Assumed path of the data files; create the output folder if it is missing.
output_folder = '/media/gyalex/Data/face_det_dataset/rgbd_data/rgbd'
if not os.path.exists(output_folder):
    os.mkdir(output_folder)
+
for idx in range(32, 33):  # currently processes only PointImage32
    txt_file_path = '/media/gyalex/Data/face_det_dataset/rgbd_data/PointImage'+ str(idx) + '.txt'
    _, name = os.path.split(txt_file_path)
    print(txt_file_path)

    with open(txt_file_path, 'r') as file:
        points = []
        rgb_list = []
        ori_rgb_list = []
        normal_list = []

        # read the data line by line
        for line in file:
            # strip the trailing newline and split the fields
            x, y, z, r, g, b, nx, ny, nz, w = line.split()  # w is read but unused
            # convert the strings to floats
            x = float(x)
            y = float(y)
            z = float(z)
            r = float(r)
            g = float(g)
            b = float(b)
            nx = float(nx)
            ny = float(ny)
            nz = float(nz)
            # append the point, color and normal to the lists
            points.append((x, y, z))
            rgb_list.append((r/255.0, g/255.0 , b/255.0))
            normal_list.append((nx, ny, nz))

            ori_r = int(r)
            ori_g = int(g)
            ori_b = int(b)
            ori_rgb_list.append((ori_r, ori_g , ori_b))

        np_points = np.asarray(points)

        np_points_a = np_points

        np_colors = np.asarray(rgb_list)
        np_normals = np.asarray(normal_list)

        np_colors_ori = np.asarray(ori_rgb_list)

        pcd = o3d.geometry.PointCloud()
        pcd.points = o3d.utility.Vector3dVector(np_points)
        pcd.colors = o3d.utility.Vector3dVector(np_colors)
        pcd.normals = o3d.utility.Vector3dVector(np_normals)

        # pixel "x_y" key -> point index; used to map 2D picks back to 3D points
        map_dict = {}

        image = np.ones((1000, 1000, 3),dtype=np.uint8)*255  # white canvas
        for i in range(np.array(pcd.points).shape[0]):
            # +400 presumably recenters the cloud inside the 1000x1000 canvas — TODO confirm
            x = np.array(pcd.points)[i,0]+400
            y = np.array(pcd.points)[i,1]+400

            # splat each point as a small cross so the render has fewer holes
            image[int(x),int(y),:] = (np.array(pcd.colors)[i,:]*255).astype(np.uint8)
            image[int(x+1),int(y),:] = (np.array(pcd.colors)[i,:]*255).astype(np.uint8)
            image[int(x),int(y+1),:] = (np.array(pcd.colors)[i,:]*255).astype(np.uint8)
            image[int(x-1),int(y),:] = (np.array(pcd.colors)[i,:]*255).astype(np.uint8)
            image[int(x),int(y-1),:] = (np.array(pcd.colors)[i,:]*255).astype(np.uint8)

            map_dict[str(int(x)) + '_' + str(int(y))] = i
            map_dict[str(int(x+1)) + '_' + str(int(y))] = i
            map_dict[str(int(x)) + '_' + str(int(y+1))] = i
            map_dict[str(int(x-1)) + '_' + str(int(y))] = i
            map_dict[str(int(x)) + '_' + str(int(y-1))] = i

            # if [int(y), int(x)] in cord_list:
            #     image[int(x),int(y),:] = np.array([0, 255, 0])

            # if [int(y), int(x+1)] in cord_list:
            #     image[int(x+1),int(y),:] = np.array([0, 255, 0])

            # if [int(y+1), int(x)] in cord_list:
            #     image[int(x),int(y+1),:] = np.array([0, 255, 0])

            # if [int(y), int(x-1)] in cord_list:
            #     image[int(x-1),int(y),:] = np.array([0, 255, 0])

            # if [int(y-1), int(x)] in cord_list:
            #     image[int(x),int(y-1),:] = np.array([0, 255, 0])

            # if [int(y-1), int(x-1)] in cord_list:
            #     image[int(x-1),int(y-1),:] = np.array([0, 255, 0])

            # if [int(y+1), int(x+1)] in cord_list:
            #     image[int(x+1),int(y+1),:] = np.array([0, 255, 0])

        # collect a map key for each landmark that hit a splatted (non-white) pixel,
        # searching a few nearby offsets when the exact pixel is empty
        h_list = []
        for m in cord_list:
            a, b = m[0], m[1]
            c = image[int(b),int(a),:][0]  # NOTE(review): unused

            flag = False

            if image[int(b),int(a),:][1] != 255:
                h_list.append(str(int(b))+'_'+str(int(a)))
                flag = True
            else:
                if image[int(b)-2,int(a)-2,:][1] != 255:
                    h_list.append(str(int(b)-2)+'_'+str(int(a)-2))
                    flag = True
                elif image[int(b)+2,int(a)+2,:][1] != 255:
                    h_list.append(str(int(b)+2)+'_'+str(int(a)+2))
                    flag = True
                elif image[int(b),int(a)-3,:][1] != 255:
                    h_list.append(str(int(b))+'_'+str(int(a)-3))
                    flag = True

            # if flag == False:
            #     cc = image[int(b),int(a),:][1]

        # cv2.circle(image, (465,505), 2, (0, 255, 0), -1)

        # cv2.imshow('win', image)
        # cv2.waitKey(0)

        # write the selected point indices and mark them green in the point cloud
        with open('pid.txt', 'w') as f:
            for h in h_list:
                pid = map_dict[h]
                s = str(pid) + '\n'
                f.write(s)

                np_colors[pid,:] = np.array([0, 255, 0])  # NOTE(review): np_colors holds 0-1 normalized values; 255 here looks wrong — verify

        f.close()  # redundant: the with-statement already closed the file

        pcd0 = o3d.geometry.PointCloud()
        pcd0.points = o3d.utility.Vector3dVector(np_points)
        pcd0.colors = o3d.utility.Vector3dVector(np_colors)
        pcd0.normals = o3d.utility.Vector3dVector(np_normals)

        o3d.io.write_point_cloud('aa.ply', pcd0)


        mm = cv2.cvtColor(image, cv2.COLOR_RGB2BGR)
        image3 = cv2.flip(mm, -1)  # flip both axes to undo the earlier mirroring

        # cv2.imwrite('./rgb.png', image3)
+
# Re-read the landmark coordinates and overlay them on the rendered image.
with open('./cord.txt', 'r') as f:
    lines = f.readlines()
    for line in lines:
        m = line.split()
        x = int(m[0])
        y = int(m[1])

        # mirror into the flipped 1000x1000 frame — TODO confirm
        x = 1000 - x
        y = 1000 - y

        cv2.circle(image, (x,y), 2, (0, 255, 0), -1)

        idx = map_dict[str(x)+'_'+str(y)]  # NOTE(review): raises KeyError when no point was splatted at (x, y)

        a = 0  # breakpoint anchor for debugging

# cv2.imshow("win", image)
# cv2.waitKey(0)
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ # import matplotlib.pyplot as plt
+ # plt.imshow(image)
+ # plt.show()
+
+ # save_pcd_path = os.path.join(output_folder, name[:-3]+'ply')
+ # # o3d.io.write_point_cloud(save_pcd_path, pcd)
+
+ # # render
+ # import trimesh
+ # # fuze_trimesh = trimesh.load('/home/gyalex/Desktop/PointImage32.obj')
+ # # mesh = pyrender.Mesh.from_trimesh(fuze_trimesh)
+ # mesh = pyrender.Mesh.from_points(np_points, np_colors_ori, np_normals)
+
+ # import math
+ # camera = PerspectiveCamera(yfov=math.pi / 3, aspectRatio=1.0)
+ # camera_pose = np.array([[-1.0, 0.0, 0.0, 0], \
+ # [0.0, 1.0, 0.0, 0], \
+ # [0.0, 0.0, -1.0, 0], \
+ # [0.0, 0.0, 0.0, 1.0]])
+
+    # # create the scene
+ # scene = pyrender.Scene()
+ # scene.add(mesh)
+ # scene.add(camera, pose=camera_pose)
+
+ # # light = pyrender.SpotLight(color=np.ones(3), intensity=3.0, innerConeAngle=np.pi/16.0, outerConeAngle=np.pi/6.0)
+ # # scene.add(light, pose=camera_pose)
+
+    # # render the scene
+ # renderer = pyrender.OffscreenRenderer(viewport_width=1280, viewport_height=1024)
+ # color, depth = renderer.render(scene)
+
+    # # # set up the scene and light source
+ # # scene = pyrender.Scene()
+ # # scene.add(point_cloud_mesh, 'point_cloud')
+ # # camera = PerspectiveCamera(yfov=45.0, aspectRatio=1.0)
+ # # scene.add(camera)
+
+    # # # render the scene
+ # # renderer = pyrender.OffscreenRenderer(viewport_width=1280, viewport_height=1024)
+ # # color, depth = renderer.render(scene)
+
+    # # save the rendered result as an image
+ # import cv2
+ # cv2.imshow('win', color)
+
+ # rgb_img = cv2.imread('/media/gyalex/Data/face_det_dataset/rgbd_data/color_32.bmp')
+ # cv2.imshow('win0', rgb_img)
+ # cv2.waitKey(0)
\ No newline at end of file
diff --git a/LAM_gpro/external/landmark_detection/evaluate.py b/LAM_gpro/external/landmark_detection/evaluate.py
new file mode 100644
index 0000000..7320242
--- /dev/null
+++ b/LAM_gpro/external/landmark_detection/evaluate.py
@@ -0,0 +1,258 @@
+import os
+import cv2
+import math
+import argparse
+import numpy as np
+from tqdm import tqdm
+
+import torch
+
+# private package
+from lib import utility
+
+
+
class GetCropMatrix():
    """
    Build the 3x3 similarity matrix that maps a face region (scale, center)
    onto the square network input of side image_size.
    """

    def __init__(self, image_size, target_face_scale, align_corners=False):
        self.image_size = image_size
        self.target_face_scale = target_face_scale
        self.align_corners = align_corners

    def _compose_rotate_and_scale(self, angle, scale, shift_xy, from_center, to_center):
        # Rotation+scale block of the similarity transform.
        acos = scale * math.cos(angle)
        asin = scale * math.sin(angle)
        fx, fy = from_center
        tx, ty = to_center
        # Translation chosen so that from_center lands on to_center (plus shift).
        row0 = [acos, -asin, tx - acos * fx + asin * fy + shift_xy[0]]
        row1 = [asin, acos, ty - asin * fx - acos * fy + shift_xy[1]]
        return np.array([row0, row1, [0.0, 0.0, 1.0]], np.float32)

    def process(self, scale, center_w, center_h):
        # Target extent depends on the sampling convention.
        size = self.image_size
        extent = size - 1 if self.align_corners else size
        # 200.0: presumably the reference face-box size the detector scale is
        # expressed in — TODO confirm.
        zoom = size / (scale * self.target_face_scale * 200.0)
        return self._compose_rotate_and_scale(
            0, zoom, (0, 0),
            from_center=[center_w, center_h],
            to_center=[extent / 2.0, extent / 2.0])
+
+
class TransformPerspective():
    """
    Warp an image by a 3x3 perspective matrix into a square crop.
    """

    def __init__(self, image_size):
        # Side length of the square output crop.
        self.image_size = image_size

    def process(self, image, matrix):
        output_size = (self.image_size, self.image_size)
        # Bilinear sampling; pixels outside the source image become 0 (black).
        return cv2.warpPerspective(image, matrix, dsize=output_size,
                                   flags=cv2.INTER_LINEAR, borderValue=0)
+
+
class TransformPoints2D():
    """
    Apply a 3x3 projective transform to n 2D points (nx2 -> nx2).
    """

    def process(self, srcPoints, matrix):
        # Lift to homogeneous coordinates: nx2 -> nx3.
        ones = np.ones_like(srcPoints[:, [0]])
        homogeneous = np.concatenate([srcPoints, ones], axis=1)
        mapped = homogeneous @ np.transpose(matrix)
        # Perspective divide back to 2D, preserving the input dtype.
        mapped = mapped[:, :2] / mapped[:, [2, 2]]
        return mapped.astype(srcPoints.dtype)
+
+
class Alignment:
    # Facial-landmark alignment wrapper: loads the network and maps face
    # crops to landmark predictions in original-image coordinates.

    def __init__(self, args, model_path, dl_framework, device_ids):
        """Load the alignment network.

        Args:
            args: parsed CLI namespace; used to resolve the config via config_name.
            model_path: checkpoint path (weights stored under key "net").
            dl_framework: only "pytorch" is supported.
            device_ids: list of device ids; [-1] selects a CPU load.
        """
        self.input_size = 256  # square network input resolution
        self.target_face_scale = 1.0
        self.dl_framework = dl_framework

        # model
        if self.dl_framework == "pytorch":
            # conf
            self.config = utility.get_config(args)
            self.config.device_id = device_ids[0]
            # set environment
            utility.set_environment(self.config)
            self.config.init_instance()
            if self.config.logger is not None:
                self.config.logger.info("Loaded configure file %s: %s" % (args.config_name, self.config.id))
                self.config.logger.info("\n" + "\n".join(["%s: %s" % item for item in self.config.__dict__.items()]))

            net = utility.get_net(self.config)
            if device_ids == [-1]:
                checkpoint = torch.load(model_path, map_location="cpu")
            else:
                checkpoint = torch.load(model_path)
            net.load_state_dict(checkpoint["net"])
            net = net.to(self.config.device_id)
            net.eval()
            self.alignment = net
        else:
            assert False  # only the pytorch backend is implemented

        self.getCropMatrix = GetCropMatrix(image_size=self.input_size, target_face_scale=self.target_face_scale,
                                           align_corners=True)
        self.transformPerspective = TransformPerspective(image_size=self.input_size)
        self.transformPoints2D = TransformPoints2D()

    def norm_points(self, points, align_corners=False):
        # Map pixel coordinates into the [-1, +1] range used by the network.
        if align_corners:
            # [0, SIZE-1] -> [-1, +1]
            return points / torch.tensor([self.input_size - 1, self.input_size - 1]).to(points).view(1, 1, 2) * 2 - 1
        else:
            # [-0.5, SIZE-0.5] -> [-1, +1]
            return (points * 2 + 1) / torch.tensor([self.input_size, self.input_size]).to(points).view(1, 1, 2) - 1

    def denorm_points(self, points, align_corners=False):
        # Inverse of norm_points: [-1, +1] back to pixel coordinates.
        if align_corners:
            # [-1, +1] -> [0, SIZE-1]
            return (points + 1) / 2 * torch.tensor([self.input_size - 1, self.input_size - 1]).to(points).view(1, 1, 2)
        else:
            # [-1, +1] -> [-0.5, SIZE-0.5]
            return ((points + 1) * torch.tensor([self.input_size, self.input_size]).to(points).view(1, 1, 2) - 1) / 2

    def preprocess(self, image, scale, center_w, center_h):
        # Crop the face region and convert it to a normalized NCHW tensor in [-1, 1].
        matrix = self.getCropMatrix.process(scale, center_w, center_h)
        input_tensor = self.transformPerspective.process(image, matrix)
        input_tensor = input_tensor[np.newaxis, :]

        input_tensor = torch.from_numpy(input_tensor)
        input_tensor = input_tensor.float().permute(0, 3, 1, 2)
        input_tensor = input_tensor / 255.0 * 2.0 - 1.0
        input_tensor = input_tensor.to(self.config.device_id)
        return input_tensor, matrix

    def postprocess(self, srcPoints, coeff):
        # dstPoints = self.transformPoints2D.process(srcPoints, coeff)
        # matrix^(-1) * src = dst
        # src = matrix * dst
        # Apply the affine part of coeff (the inverted crop matrix) to each point.
        dstPoints = np.zeros(srcPoints.shape, dtype=np.float32)
        for i in range(srcPoints.shape[0]):
            dstPoints[i][0] = coeff[0][0] * srcPoints[i][0] + coeff[0][1] * srcPoints[i][1] + coeff[0][2]
            dstPoints[i][1] = coeff[1][0] * srcPoints[i][0] + coeff[1][1] * srcPoints[i][1] + coeff[1][2]
        return dstPoints

    def analyze(self, image, scale, center_w, center_h):
        """Run landmark detection on one face; returns (N, 2) points in image coords."""
        input_tensor, matrix = self.preprocess(image, scale, center_w, center_h)

        if self.dl_framework == "pytorch":
            with torch.no_grad():
                output = self.alignment(input_tensor)
            landmarks = output[-1][0]  # last stage output — TODO confirm the output layout
        else:
            assert False

        landmarks = self.denorm_points(landmarks)
        landmarks = landmarks.data.cpu().numpy()[0]
        landmarks = self.postprocess(landmarks, np.linalg.inv(matrix))

        return landmarks
+
+
def L2(p1, p2):
    """Euclidean (L2) distance between two landmark coordinate arrays."""
    difference = p1 - p2
    return np.linalg.norm(difference)
+
+
def NME(landmarks_gt, landmarks_pv):
    """Normalized Mean Error between predicted and ground-truth landmarks.

    The mean point-to-point Euclidean error is normalized by the ground-truth
    inter-ocular distance (outer eye corners).

    Args:
        landmarks_gt: (N, 2) ground-truth points, N in {29, 68, 98}.
        landmarks_pv: (N, 2) predicted points.

    Returns:
        Mean landmark error divided by the eye-corner span.

    Raises:
        ValueError: if the landmark count has no known eye-corner indices
            (previously this fell through and crashed with a NameError).
    """
    # Outer eye-corner indices for the supported annotation schemes
    # (COFW-29, 300W-68, WFLW-98).
    eye_corners = {29: (16, 17), 68: (36, 45), 98: (60, 72)}
    pts_num = landmarks_gt.shape[0]
    if pts_num not in eye_corners:
        raise ValueError("unsupported landmark count: %d" % pts_num)
    left_index, right_index = eye_corners[pts_num]

    # Inter-ocular distance used as the normalizer. A degenerate ground truth
    # (eye_span == 0) yields inf/nan, matching the previous behavior.
    eye_span = np.linalg.norm(landmarks_gt[left_index] - landmarks_gt[right_index])
    errors = np.linalg.norm(landmarks_pv - landmarks_gt, axis=1)
    return (errors / eye_span).mean()
+
+
def evaluate(args, model_path, metadata_path, device_ids, mode):
    """Run the alignment model over a metadata file and report the metric.

    Args:
        args: CLI namespace (config_name, image_dir, ...).
        model_path: checkpoint path for the alignment network.
        metadata_path: TSV file; first six columns: image name, 5-pt landmarks,
            ground-truth landmarks, scale, center_w, center_h.
        device_ids: [-1] for CPU, otherwise GPU ids.
        mode: only "nme" is implemented; anything else is a no-op.
    """
    alignment = Alignment(args, model_path, dl_framework="pytorch", device_ids=device_ids)
    config = alignment.config
    nme_sum = 0
    with open(metadata_path, 'r') as f:
        lines = f.readlines()
        for k, line in enumerate(tqdm(lines)):
            item = line.strip().split("\t")
            image_name, landmarks_5pts, landmarks_gt, scale, center_w, center_h = item[:6]
            # image & keypoints alignment
            # normalize legacy path prefixes found in the metadata
            image_name = image_name.replace('\\', '/')
            image_name = image_name.replace('//msr-facestore/Workspace/MSRA_EP_Allergan/users/yanghuan/training_data/wflw/rawImages/', '')
            image_name = image_name.replace('./rawImages/', '')
            image_path = os.path.join(config.image_dir, image_name)
            landmarks_gt = np.array(list(map(float, landmarks_gt.split(","))), dtype=np.float32).reshape(-1, 2)
            scale, center_w, center_h = float(scale), float(center_w), float(center_h)

            image = cv2.imread(image_path)
            landmarks_pv = alignment.analyze(image, scale, center_w, center_h)

            # NME
            if mode == "nme":
                nme = NME(landmarks_gt, landmarks_pv)
                nme_sum += nme
                # print("Current NME(%d): %f" % (k + 1, (nme_sum / (k + 1))))
            else:
                pass

    if mode == "nme":
        # NOTE(review): k is undefined when the metadata file is empty — verify inputs.
        print("Final NME: %f" % (100*nme_sum / (k + 1)))
    else:
        pass
+
+
+if __name__ == "__main__":
+ parser = argparse.ArgumentParser(description="Evaluation script")
+ parser.add_argument("--config_name", type=str, default="alignment", help="set configure file name")
+ parser.add_argument("--model_path", type=str, default="./train.pkl", help="the path of model")
+ parser.add_argument("--data_definition", type=str, default='WFLW', help="COFW/300W/WFLW")
+ parser.add_argument("--metadata_path", type=str, default="", help="the path of metadata")
+ parser.add_argument("--image_dir", type=str, default="", help="the path of image")
+ parser.add_argument("--device_ids", type=str, default="0", help="set device ids, -1 means use cpu device, >= 0 means use gpu device")
+ parser.add_argument("--mode", type=str, default="nme", help="set the evaluate mode: nme")
+ args = parser.parse_args()
+
+ device_ids = list(map(int, args.device_ids.split(",")))
+ evaluate(
+ args,
+ model_path=args.model_path,
+ metadata_path=args.metadata_path,
+ device_ids=device_ids,
+ mode=args.mode)
diff --git a/LAM_gpro/external/landmark_detection/infer_folder.py b/LAM_gpro/external/landmark_detection/infer_folder.py
new file mode 100644
index 0000000..a34c75d
--- /dev/null
+++ b/LAM_gpro/external/landmark_detection/infer_folder.py
@@ -0,0 +1,253 @@
+import cv2
+import math
+import copy
+import numpy as np
+import argparse
+import torch
+import json
+
+# private package
+from lib import utility
+from FaceBoxesV2.faceboxes_detector import *
+
class GetCropMatrix():
    """
    Build the 3x3 similarity matrix that maps a face region (scale, center)
    onto the square network input of side image_size.
    """

    def __init__(self, image_size, target_face_scale, align_corners=False):
        self.image_size = image_size
        self.target_face_scale = target_face_scale
        self.align_corners = align_corners

    def _compose_rotate_and_scale(self, angle, scale, shift_xy, from_center, to_center):
        # Rotation+scale block of the similarity transform.
        acos = scale * math.cos(angle)
        asin = scale * math.sin(angle)
        fx, fy = from_center
        tx, ty = to_center
        # Translation chosen so that from_center lands on to_center (plus shift).
        row0 = [acos, -asin, tx - acos * fx + asin * fy + shift_xy[0]]
        row1 = [asin, acos, ty - asin * fx - acos * fy + shift_xy[1]]
        return np.array([row0, row1, [0.0, 0.0, 1.0]], np.float32)

    def process(self, scale, center_w, center_h):
        # Target extent depends on the sampling convention.
        size = self.image_size
        extent = size - 1 if self.align_corners else size
        # 200.0: presumably the reference face-box size the detector scale is
        # expressed in — TODO confirm.
        zoom = size / (scale * self.target_face_scale * 200.0)
        return self._compose_rotate_and_scale(
            0, zoom, (0, 0),
            from_center=[center_w, center_h],
            to_center=[extent / 2.0, extent / 2.0])
+
+
class TransformPerspective():
    """
    Warp an image by a 3x3 perspective matrix into a square crop.
    """

    def __init__(self, image_size):
        # Side length of the square output crop.
        self.image_size = image_size

    def process(self, image, matrix):
        output_size = (self.image_size, self.image_size)
        # Bilinear sampling; pixels outside the source image become 0 (black).
        return cv2.warpPerspective(image, matrix, dsize=output_size,
                                   flags=cv2.INTER_LINEAR, borderValue=0)
+
+
class TransformPoints2D():
    """
    Apply a 3x3 projective transform to n 2D points (nx2 -> nx2).
    """

    def process(self, srcPoints, matrix):
        # Lift to homogeneous coordinates: nx2 -> nx3.
        ones = np.ones_like(srcPoints[:, [0]])
        homogeneous = np.concatenate([srcPoints, ones], axis=1)
        mapped = homogeneous @ np.transpose(matrix)
        # Perspective divide back to 2D, preserving the input dtype.
        mapped = mapped[:, :2] / mapped[:, [2, 2]]
        return mapped.astype(srcPoints.dtype)
+
class Alignment:
    # Facial-landmark alignment wrapper: loads the network and maps face
    # crops to landmark predictions in original-image coordinates.

    def __init__(self, args, model_path, dl_framework, device_ids):
        """Load the alignment network.

        Args:
            args: CLI namespace; used to resolve the config via config_name.
            model_path: checkpoint path (weights stored under key "net").
            dl_framework: only "pytorch" is supported.
            device_ids: list of device ids; [-1] selects the CPU path.
        """
        self.input_size = 256  # square network input resolution
        self.target_face_scale = 1.0
        self.dl_framework = dl_framework

        # model
        if self.dl_framework == "pytorch":
            # conf
            self.config = utility.get_config(args)
            self.config.device_id = device_ids[0]
            # set environment
            utility.set_environment(self.config)
            # self.config.init_instance()
            # if self.config.logger is not None:
            #     self.config.logger.info("Loaded configure file %s: %s" % (args.config_name, self.config.id))
            #     self.config.logger.info("\n" + "\n".join(["%s: %s" % item for item in self.config.__dict__.items()]))

            net = utility.get_net(self.config)
            if device_ids == [-1]:
                checkpoint = torch.load(model_path, map_location="cpu")
            else:
                checkpoint = torch.load(model_path)
            net.load_state_dict(checkpoint["net"])

            # device_id == -1 means CPU inference
            if self.config.device_id == -1:
                net = net.cpu()
            else:
                net = net.to(self.config.device_id)

            net.eval()
            self.alignment = net
        else:
            assert False  # only the pytorch backend is implemented

        self.getCropMatrix = GetCropMatrix(image_size=self.input_size, target_face_scale=self.target_face_scale,
                                           align_corners=True)
        self.transformPerspective = TransformPerspective(image_size=self.input_size)
        self.transformPoints2D = TransformPoints2D()

    def norm_points(self, points, align_corners=False):
        # Map pixel coordinates into the [-1, +1] range used by the network.
        if align_corners:
            # [0, SIZE-1] -> [-1, +1]
            return points / torch.tensor([self.input_size - 1, self.input_size - 1]).to(points).view(1, 1, 2) * 2 - 1
        else:
            # [-0.5, SIZE-0.5] -> [-1, +1]
            return (points * 2 + 1) / torch.tensor([self.input_size, self.input_size]).to(points).view(1, 1, 2) - 1

    def denorm_points(self, points, align_corners=False):
        # Inverse of norm_points: [-1, +1] back to pixel coordinates.
        if align_corners:
            # [-1, +1] -> [0, SIZE-1]
            return (points + 1) / 2 * torch.tensor([self.input_size - 1, self.input_size - 1]).to(points).view(1, 1, 2)
        else:
            # [-1, +1] -> [-0.5, SIZE-0.5]
            return ((points + 1) * torch.tensor([self.input_size, self.input_size]).to(points).view(1, 1, 2) - 1) / 2

    def preprocess(self, image, scale, center_w, center_h):
        # Crop the face region and convert it to a normalized NCHW tensor in [-1, 1].
        matrix = self.getCropMatrix.process(scale, center_w, center_h)
        input_tensor = self.transformPerspective.process(image, matrix)
        input_tensor = input_tensor[np.newaxis, :]

        input_tensor = torch.from_numpy(input_tensor)
        input_tensor = input_tensor.float().permute(0, 3, 1, 2)
        input_tensor = input_tensor / 255.0 * 2.0 - 1.0

        # keep the tensor on the same device as the network
        if self.config.device_id == -1:
            input_tensor = input_tensor.cpu()
        else:
            input_tensor = input_tensor.to(self.config.device_id)

        return input_tensor, matrix

    def postprocess(self, srcPoints, coeff):
        # dstPoints = self.transformPoints2D.process(srcPoints, coeff)
        # matrix^(-1) * src = dst
        # src = matrix * dst
        # Apply the affine part of coeff (the inverted crop matrix) to each point.
        dstPoints = np.zeros(srcPoints.shape, dtype=np.float32)
        for i in range(srcPoints.shape[0]):
            dstPoints[i][0] = coeff[0][0] * srcPoints[i][0] + coeff[0][1] * srcPoints[i][1] + coeff[0][2]
            dstPoints[i][1] = coeff[1][0] * srcPoints[i][0] + coeff[1][1] * srcPoints[i][1] + coeff[1][2]
        return dstPoints

    def analyze(self, image, scale, center_w, center_h):
        """Run landmark detection on one face; returns (N, 2) points in image coords."""
        input_tensor, matrix = self.preprocess(image, scale, center_w, center_h)

        if self.dl_framework == "pytorch":
            with torch.no_grad():
                output = self.alignment(input_tensor)
            landmarks = output[-1][0]  # last stage output — TODO confirm the output layout
        else:
            assert False

        landmarks = self.denorm_points(landmarks)
        landmarks = landmarks.data.cpu().numpy()[0]
        landmarks = self.postprocess(landmarks, np.linalg.inv(matrix))

        return landmarks
+
if __name__ == '__main__':
    parser = argparse.ArgumentParser(description="inference script")
    parser.add_argument('--folder_path', type=str, help='Path to image folder')
    args = parser.parse_args()

    # args.folder_path = '/media/gyalex/Data/flame/ph_test/head_images/flame/image'

    current_path = os.getcwd()

    use_gpu = True
    ########### face detection ############
    if use_gpu:
        device = torch.device("cuda:0")
    else:
        device = torch.device("cpu")

    current_path = os.getcwd()  # NOTE(review): duplicate of the assignment above
    det_model_path = os.path.join(current_path, 'preprocess', 'submodules', 'Landmark_detection', 'FaceBoxesV2/weights/FaceBoxesV2.pth')
    detector = FaceBoxesDetector('FaceBoxes', det_model_path, use_gpu, device)

    ########### facial alignment ############
    model_path = os.path.join(current_path, 'preprocess', 'submodules', 'Landmark_detection', 'weights/68_keypoints_model.pkl')

    if use_gpu:
        device_ids = [0]
    else:
        device_ids = [-1]

    args.config_name = 'alignment'
    alignment = Alignment(args, model_path, dl_framework="pytorch", device_ids=device_ids)

    img_path_list = os.listdir(args.folder_path)
    kpts_code = dict()  # file name -> list of [x, y] landmarks

    ########### inference ############
    for file_name in img_path_list:
        abs_path = os.path.join(args.folder_path, file_name)

        image = cv2.imread(abs_path)
        image_draw = copy.deepcopy(image)

        detections, _ = detector.detect(image, 0.6, 1)  # confidence threshold 0.6
        for idx in range(len(detections)):
            # detection entries appear to be [..., x, y, w, h] from index 2 — TODO confirm
            x1_ori = detections[idx][2]
            y1_ori = detections[idx][3]
            x2_ori = x1_ori + detections[idx][4]
            y2_ori = y1_ori + detections[idx][5]

            scale = max(x2_ori - x1_ori, y2_ori - y1_ori) / 180  # 180: presumed reference box size — TODO confirm
            center_w = (x1_ori + x2_ori) / 2
            center_h = (y1_ori + y2_ori) / 2
            scale, center_w, center_h = float(scale), float(center_w), float(center_h)

            landmarks_pv = alignment.analyze(image, scale, center_w, center_h)
            landmarks_pv_list = landmarks_pv.tolist()

            for num in range(landmarks_pv.shape[0]):
                cv2.circle(image_draw, (round(landmarks_pv[num][0]), round(landmarks_pv[num][1])),
                           2, (0, 255, 0), -1)

            kpts_code[file_name] = landmarks_pv_list
        # NOTE(review): assumes folder_path ends with 'image' and that the
        # sibling 'landmark' directory already exists — verify.
        save_path = args.folder_path[:-5] + 'landmark'
        cv2.imwrite(os.path.join(save_path, file_name), image_draw)

    path = args.folder_path[:-5]
    json.dump(kpts_code, open(os.path.join(path, 'keypoint.json'), 'w'))
diff --git a/LAM_gpro/external/landmark_detection/infer_image.py b/LAM_gpro/external/landmark_detection/infer_image.py
new file mode 100644
index 0000000..a2e42a1
--- /dev/null
+++ b/LAM_gpro/external/landmark_detection/infer_image.py
@@ -0,0 +1,251 @@
+import cv2
+import math
+import copy
+import numpy as np
+import argparse
+import torch
+
+# private package
+from external.landmark_detection.lib import utility
+from external.landmark_detection.FaceBoxesV2.faceboxes_detector import *
+
class GetCropMatrix():
    """
    Build the 3x3 similarity matrix that maps a face region (scale, center)
    onto the square network input of side image_size.
    """

    def __init__(self, image_size, target_face_scale, align_corners=False):
        self.image_size = image_size
        self.target_face_scale = target_face_scale
        self.align_corners = align_corners

    def _compose_rotate_and_scale(self, angle, scale, shift_xy, from_center, to_center):
        # Rotation+scale block of the similarity transform.
        acos = scale * math.cos(angle)
        asin = scale * math.sin(angle)
        fx, fy = from_center
        tx, ty = to_center
        # Translation chosen so that from_center lands on to_center (plus shift).
        row0 = [acos, -asin, tx - acos * fx + asin * fy + shift_xy[0]]
        row1 = [asin, acos, ty - asin * fx - acos * fy + shift_xy[1]]
        return np.array([row0, row1, [0.0, 0.0, 1.0]], np.float32)

    def process(self, scale, center_w, center_h):
        # Target extent depends on the sampling convention.
        size = self.image_size
        extent = size - 1 if self.align_corners else size
        # 200.0: presumably the reference face-box size the detector scale is
        # expressed in — TODO confirm.
        zoom = size / (scale * self.target_face_scale * 200.0)
        return self._compose_rotate_and_scale(
            0, zoom, (0, 0),
            from_center=[center_w, center_h],
            to_center=[extent / 2.0, extent / 2.0])
+
+
class TransformPerspective():
    """
    Warp an image by a 3x3 perspective matrix into a square crop.
    """

    def __init__(self, image_size):
        # Side length of the square output crop.
        self.image_size = image_size

    def process(self, image, matrix):
        output_size = (self.image_size, self.image_size)
        # Bilinear sampling; pixels outside the source image become 0 (black).
        return cv2.warpPerspective(image, matrix, dsize=output_size,
                                   flags=cv2.INTER_LINEAR, borderValue=0)
+
+
class TransformPoints2D():
    """
    Apply a 3x3 projective transform to n 2D points (nx2 -> nx2).
    """

    def process(self, srcPoints, matrix):
        # Lift to homogeneous coordinates: nx2 -> nx3.
        ones = np.ones_like(srcPoints[:, [0]])
        homogeneous = np.concatenate([srcPoints, ones], axis=1)
        mapped = homogeneous @ np.transpose(matrix)
        # Perspective divide back to 2D, preserving the input dtype.
        mapped = mapped[:, :2] / mapped[:, [2, 2]]
        return mapped.astype(srcPoints.dtype)
+
class Alignment:
    # Facial-landmark alignment wrapper: loads the network and maps face
    # crops to landmark predictions in original-image coordinates.

    def __init__(self, args, model_path, dl_framework, device_ids):
        """Load the alignment network.

        Args:
            args: CLI namespace; used to resolve the config via config_name.
            model_path: checkpoint path (weights stored under key "net").
            dl_framework: only "pytorch" is supported.
            device_ids: list of device ids; [-1] selects the CPU path.
        """
        self.input_size = 256  # square network input resolution
        self.target_face_scale = 1.0
        self.dl_framework = dl_framework

        # model
        if self.dl_framework == "pytorch":
            # conf
            self.config = utility.get_config(args)
            self.config.device_id = device_ids[0]
            # set environment
            # utility.set_environment(self.config)
            # self.config.init_instance()
            # if self.config.logger is not None:
            #     self.config.logger.info("Loaded configure file %s: %s" % (args.config_name, self.config.id))
            #     self.config.logger.info("\n" + "\n".join(["%s: %s" % item for item in self.config.__dict__.items()]))

            net = utility.get_net(self.config)
            if device_ids == [-1]:
                checkpoint = torch.load(model_path, map_location="cpu")
            else:
                checkpoint = torch.load(model_path)
            net.load_state_dict(checkpoint["net"])

            # device_id == -1 means CPU inference
            if self.config.device_id == -1:
                net = net.cpu()
            else:
                net = net.to(self.config.device_id)

            net.eval()
            self.alignment = net
        else:
            assert False  # only the pytorch backend is implemented

        self.getCropMatrix = GetCropMatrix(image_size=self.input_size, target_face_scale=self.target_face_scale,
                                           align_corners=True)
        self.transformPerspective = TransformPerspective(image_size=self.input_size)
        self.transformPoints2D = TransformPoints2D()

    def norm_points(self, points, align_corners=False):
        # Map pixel coordinates into the [-1, +1] range used by the network.
        if align_corners:
            # [0, SIZE-1] -> [-1, +1]
            return points / torch.tensor([self.input_size - 1, self.input_size - 1]).to(points).view(1, 1, 2) * 2 - 1
        else:
            # [-0.5, SIZE-0.5] -> [-1, +1]
            return (points * 2 + 1) / torch.tensor([self.input_size, self.input_size]).to(points).view(1, 1, 2) - 1

    def denorm_points(self, points, align_corners=False):
        # Inverse of norm_points: [-1, +1] back to pixel coordinates.
        if align_corners:
            # [-1, +1] -> [0, SIZE-1]
            return (points + 1) / 2 * torch.tensor([self.input_size - 1, self.input_size - 1]).to(points).view(1, 1, 2)
        else:
            # [-1, +1] -> [-0.5, SIZE-0.5]
            return ((points + 1) * torch.tensor([self.input_size, self.input_size]).to(points).view(1, 1, 2) - 1) / 2

    def preprocess(self, image, scale, center_w, center_h):
        # Crop the face region and convert it to a normalized NCHW tensor in [-1, 1].
        matrix = self.getCropMatrix.process(scale, center_w, center_h)
        input_tensor = self.transformPerspective.process(image, matrix)
        input_tensor = input_tensor[np.newaxis, :]

        input_tensor = torch.from_numpy(input_tensor)
        input_tensor = input_tensor.float().permute(0, 3, 1, 2)
        input_tensor = input_tensor / 255.0 * 2.0 - 1.0

        # keep the tensor on the same device as the network
        if self.config.device_id == -1:
            input_tensor = input_tensor.cpu()
        else:
            input_tensor = input_tensor.to(self.config.device_id)

        return input_tensor, matrix

    def postprocess(self, srcPoints, coeff):
        # dstPoints = self.transformPoints2D.process(srcPoints, coeff)
        # matrix^(-1) * src = dst
        # src = matrix * dst
        # Apply the affine part of coeff (the inverted crop matrix) to each point.
        dstPoints = np.zeros(srcPoints.shape, dtype=np.float32)
        for i in range(srcPoints.shape[0]):
            dstPoints[i][0] = coeff[0][0] * srcPoints[i][0] + coeff[0][1] * srcPoints[i][1] + coeff[0][2]
            dstPoints[i][1] = coeff[1][0] * srcPoints[i][0] + coeff[1][1] * srcPoints[i][1] + coeff[1][2]
        return dstPoints

    def analyze(self, image, scale, center_w, center_h):
        """Run landmark detection on one face; returns (N, 2) points in image coords."""
        input_tensor, matrix = self.preprocess(image, scale, center_w, center_h)

        if self.dl_framework == "pytorch":
            with torch.no_grad():
                output = self.alignment(input_tensor)
            landmarks = output[-1][0]  # last stage output — TODO confirm the output layout
        else:
            assert False

        landmarks = self.denorm_points(landmarks)
        landmarks = landmarks.data.cpu().numpy()[0]
        landmarks = self.postprocess(landmarks, np.linalg.inv(matrix))

        return landmarks
+
+# parser = argparse.ArgumentParser(description="Evaluation script")
+# args = parser.parse_args()
+# image_path = './rgb.png'
+# image = cv2.imread(image_path)
+#
+# use_gpu = False
+# ########### face detection ############
+# if use_gpu:
+# device = torch.device("cuda:0")
+# else:
+# device = torch.device("cpu")
+#
+# detector = FaceBoxesDetector('FaceBoxes', 'FaceBoxesV2/weights/FaceBoxesV2.pth', use_gpu, device)
+#
+# ########### facial alignment ############
+# model_path = './weights/68_keypoints_model.pkl'
+#
+# if use_gpu:
+# device_ids = [0]
+# else:
+# device_ids = [-1]
+#
+# args.config_name = 'alignment'
+# alignment = Alignment(args, model_path, dl_framework="pytorch", device_ids=device_ids)
+# image_draw = copy.deepcopy(image)
+#
+# ########### inference ############
+# ldk_list = []
+#
+# detections, _ = detector.detect(image, 0.9, 1)
+# for idx in range(len(detections)):
+# x1_ori = detections[idx][2]
+# y1_ori = detections[idx][3]
+# x2_ori = x1_ori + detections[idx][4]
+# y2_ori = y1_ori + detections[idx][5]
+#
+# scale = max(x2_ori - x1_ori, y2_ori - y1_ori) / 180
+# center_w = (x1_ori + x2_ori) / 2
+# center_h = (y1_ori + y2_ori) / 2
+# scale, center_w, center_h = float(scale), float(center_w), float(center_h)
+#
+# landmarks_pv = alignment.analyze(image, scale, center_w, center_h)
+#
+# for num in range(landmarks_pv.shape[0]):
+# cv2.circle(image_draw, (round(landmarks_pv[num][0]), round(landmarks_pv[num][1])),
+# 2, (0, 255, 0), -1)
+#
+# ldk_list.append([round(landmarks_pv[num][0]), round(landmarks_pv[num][1])])
+#
+# cv2.imshow("win", image_draw)
+#
+# # ldk_img = cv2.imread('/home/gyalex/Desktop/image_landmark_149/all.jpg')
+# # cv2.imshow("win1", ldk_img)
+#
+# cv2.waitKey(0)
+#
+# with open('./cord.txt', 'w') as f:
+# for num in range(len(ldk_list)):
+# s = str(ldk_list[num][0]) + ' ' + str(ldk_list[num][1]) + '\n'
+# f.write(s)
+#
+# f.close()
+
+
+
diff --git a/LAM_gpro/external/landmark_detection/infer_video.py b/LAM_gpro/external/landmark_detection/infer_video.py
new file mode 100644
index 0000000..4232c20
--- /dev/null
+++ b/LAM_gpro/external/landmark_detection/infer_video.py
@@ -0,0 +1,287 @@
+import cv2
+import math
+import copy
+import numpy as np
+import argparse
+import torch
+import json
+
+# private package
+from lib import utility
+from FaceBoxesV2.faceboxes_detector import *
+
+class GetCropMatrix():
+    """
+    from_shape -> transform_matrix
+
+    Builds a 3x3 matrix that crops a detected face (given scale and
+    center) into a square `image_size` canvas, no rotation or shift.
+    """
+
+    def __init__(self, image_size, target_face_scale, align_corners=False):
+        # image_size: side length of the square crop in pixels.
+        # target_face_scale: extra multiplier applied to the face scale.
+        # align_corners: if True, use a (size-1) extent so corner pixel
+        # centers map exactly.
+        self.image_size = image_size
+        self.target_face_scale = target_face_scale
+        self.align_corners = align_corners
+
+    def _compose_rotate_and_scale(self, angle, scale, shift_xy, from_center, to_center):
+        # Compose rotation (angle in radians), uniform scale and translation
+        # into one 3x3 affine matrix that maps from_center onto to_center.
+        cosv = math.cos(angle)
+        sinv = math.sin(angle)
+
+        fx, fy = from_center
+        tx, ty = to_center
+
+        acos = scale * cosv
+        asin = scale * sinv
+
+        a0 = acos
+        a1 = -asin
+        a2 = tx - acos * fx + asin * fy + shift_xy[0]
+
+        b0 = asin
+        b1 = acos
+        b2 = ty - asin * fx - acos * fy + shift_xy[1]
+
+        rot_scale_m = np.array([
+            [a0, a1, a2],
+            [b0, b1, b2],
+            [0.0, 0.0, 1.0]
+        ], np.float32)
+        return rot_scale_m
+
+    def process(self, scale, center_w, center_h):
+        # Return the crop matrix for a face centered at (center_w, center_h).
+        if self.align_corners:
+            to_w, to_h = self.image_size - 1, self.image_size - 1
+        else:
+            to_w, to_h = self.image_size, self.image_size
+
+        rot_mu = 0
+        # 200.0: presumably the dataset convention normalizes face size by
+        # 200 px -- TODO(review) confirm against the scale computed by callers.
+        scale_mu = self.image_size / (scale * self.target_face_scale * 200.0)
+        shift_xy_mu = (0, 0)
+        matrix = self._compose_rotate_and_scale(
+            rot_mu, scale_mu, shift_xy_mu,
+            from_center=[center_w, center_h],
+            to_center=[to_w / 2.0, to_h / 2.0])
+        return matrix
+
+
+class TransformPerspective():
+    """
+    image, matrix3x3 -> transformed_image
+
+    Warps an image by a 3x3 perspective matrix into a square canvas of
+    side `image_size`; out-of-bounds pixels are filled with 0 (black).
+    """
+
+    def __init__(self, image_size):
+        # image_size: side length of the square output image.
+        self.image_size = image_size
+
+    def process(self, image, matrix):
+        # Bilinear warp; dsize is (width, height) per OpenCV convention.
+        return cv2.warpPerspective(
+            image, matrix, dsize=(self.image_size, self.image_size),
+            flags=cv2.INTER_LINEAR, borderValue=0)
+
+
+class TransformPoints2D():
+    """
+    points (nx2), matrix (3x3) -> points (nx2)
+
+    Applies a 3x3 projective transform to n 2D points, including the
+    perspective divide, and preserves the input dtype.
+    """
+
+    def process(self, srcPoints, matrix):
+        # nx3: append a homogeneous 1 to every point.
+        desPoints = np.concatenate([srcPoints, np.ones_like(srcPoints[:, [0]])], axis=1)
+        desPoints = desPoints @ np.transpose(matrix) # nx3
+        # Perspective divide by the third (w) component.
+        desPoints = desPoints[:, :2] / desPoints[:, [2, 2]]
+        return desPoints.astype(srcPoints.dtype)
+
+class Alignment:
+    """Facial landmark alignment wrapper around a trained PyTorch network.
+
+    Crops the detected face to a 256x256 tensor, runs the model, and maps
+    the predicted landmarks back to original-image pixel coordinates.
+    """
+
+    def __init__(self, args, model_path, dl_framework, device_ids):
+        # args: argparse namespace; args.config_name selects the model config.
+        # model_path: checkpoint path; expects a dict holding a "net" state dict.
+        # dl_framework: only "pytorch" is supported (anything else asserts).
+        # device_ids: [-1] for CPU, otherwise device_ids[0] is the CUDA device.
+        self.input_size = 256
+        self.target_face_scale = 1.0
+        self.dl_framework = dl_framework
+
+        # model
+        if self.dl_framework == "pytorch":
+            # conf
+            self.config = utility.get_config(args)
+            self.config.device_id = device_ids[0]
+            # set environment
+            utility.set_environment(self.config)
+            # self.config.init_instance()
+            # if self.config.logger is not None:
+            #     self.config.logger.info("Loaded configure file %s: %s" % (args.config_name, self.config.id))
+            #     self.config.logger.info("\n" + "\n".join(["%s: %s" % item for item in self.config.__dict__.items()]))
+
+            net = utility.get_net(self.config)
+            if device_ids == [-1]:
+                checkpoint = torch.load(model_path, map_location="cpu")
+            else:
+                checkpoint = torch.load(model_path)
+            net.load_state_dict(checkpoint["net"])
+
+            if self.config.device_id == -1:
+                net = net.cpu()
+            else:
+                net = net.to(self.config.device_id)
+
+            net.eval()
+            self.alignment = net
+        else:
+            # Non-pytorch backends are not implemented.
+            assert False
+
+        self.getCropMatrix = GetCropMatrix(image_size=self.input_size, target_face_scale=self.target_face_scale,
+                                           align_corners=True)
+        self.transformPerspective = TransformPerspective(image_size=self.input_size)
+        self.transformPoints2D = TransformPoints2D()
+
+    def norm_points(self, points, align_corners=False):
+        # Map pixel-space landmark coords into the network's [-1, +1] range.
+        if align_corners:
+            # [0, SIZE-1] -> [-1, +1]
+            return points / torch.tensor([self.input_size - 1, self.input_size - 1]).to(points).view(1, 1, 2) * 2 - 1
+        else:
+            # [-0.5, SIZE-0.5] -> [-1, +1]
+            return (points * 2 + 1) / torch.tensor([self.input_size, self.input_size]).to(points).view(1, 1, 2) - 1
+
+    def denorm_points(self, points, align_corners=False):
+        # Inverse of norm_points: [-1, +1] back to pixel coordinates.
+        if align_corners:
+            # [-1, +1] -> [0, SIZE-1]
+            return (points + 1) / 2 * torch.tensor([self.input_size - 1, self.input_size - 1]).to(points).view(1, 1, 2)
+        else:
+            # [-1, +1] -> [-0.5, SIZE-0.5]
+            return ((points + 1) * torch.tensor([self.input_size, self.input_size]).to(points).view(1, 1, 2) - 1) / 2
+
+    def preprocess(self, image, scale, center_w, center_h):
+        # Crop/warp the face region to a 1x3x256x256 tensor in [-1, 1].
+        # Returns (input_tensor, crop_matrix) so the prediction can be
+        # mapped back via the matrix inverse.
+        matrix = self.getCropMatrix.process(scale, center_w, center_h)
+        input_tensor = self.transformPerspective.process(image, matrix)
+        input_tensor = input_tensor[np.newaxis, :]
+
+        input_tensor = torch.from_numpy(input_tensor)
+        input_tensor = input_tensor.float().permute(0, 3, 1, 2)
+        input_tensor = input_tensor / 255.0 * 2.0 - 1.0
+
+        if self.config.device_id == -1:
+            input_tensor = input_tensor.cpu()
+        else:
+            input_tensor = input_tensor.to(self.config.device_id)
+
+        return input_tensor, matrix
+
+    def postprocess(self, srcPoints, coeff):
+        # Apply the affine rows of `coeff` (the inverse crop matrix) to map
+        # predicted points back to original-image coordinates.
+        # dstPoints = self.transformPoints2D.process(srcPoints, coeff)
+        # matrix^(-1) * src = dst
+        # src = matrix * dst
+        dstPoints = np.zeros(srcPoints.shape, dtype=np.float32)
+        for i in range(srcPoints.shape[0]):
+            dstPoints[i][0] = coeff[0][0] * srcPoints[i][0] + coeff[0][1] * srcPoints[i][1] + coeff[0][2]
+            dstPoints[i][1] = coeff[1][0] * srcPoints[i][0] + coeff[1][1] * srcPoints[i][1] + coeff[1][2]
+        return dstPoints
+
+    def analyze(self, image, scale, center_w, center_h):
+        # Full pipeline: preprocess -> network -> denormalize -> un-crop.
+        # Returns landmarks as an (n, 2) numpy array in image pixels.
+        input_tensor, matrix = self.preprocess(image, scale, center_w, center_h)
+
+        if self.dl_framework == "pytorch":
+            with torch.no_grad():
+                output = self.alignment(input_tensor)
+            # output[-1] is the final-stack landmark tensor; take batch 0 later.
+            landmarks = output[-1][0]
+        else:
+            assert False
+
+        landmarks = self.denorm_points(landmarks)
+        landmarks = landmarks.data.cpu().numpy()[0]
+        landmarks = self.postprocess(landmarks, np.linalg.inv(matrix))
+
+        return landmarks
+
+if __name__ == '__main__':
+    # Reads a video plus a sibling <video>.json of precomputed keypoints,
+    # saves per-frame images and landmark visualizations under the video's
+    # directory, and re-exports keypoints to <video_dir>/keypoint.json.
+    parser = argparse.ArgumentParser(description="inference script")
+    parser.add_argument('--video_path', type=str, help='Path to videos',default='/media/yuanzhen/HH/DATASET/VFTH/TESTVIDEO/Clip+7CzHzeeVRlE+P0+C0+F101007-101139.mp4')
+    args = parser.parse_args()
+
+    # args.video_path = '/media/gyalex/Data/flame/ph_test/test.mp4'
+
+    # NOTE(review): `os` is not imported explicitly in this file; presumably
+    # it reaches this namespace via the wildcard faceboxes_detector import --
+    # confirm, otherwise this line raises NameError.
+    current_path = os.getcwd()
+
+    use_gpu = True
+    ########### face detection ############
+    if use_gpu:
+        device = torch.device("cuda:0")
+    else:
+        device = torch.device("cpu")
+
+    current_path = os.getcwd()
+    det_model_path = '/home/yuanzhen/code/landmark_detection/FaceBoxesV2/weights/FaceBoxesV2.pth'
+    detector = FaceBoxesDetector('FaceBoxes', det_model_path, use_gpu, device)
+
+    ########### facial alignment ############
+    model_path = '/home/yuanzhen/code/landmark_detection/weights/68_keypoints_model.pkl'
+
+    if use_gpu:
+        device_ids = [0]
+    else:
+        device_ids = [-1]
+
+    args.config_name = 'alignment'
+    alignment = Alignment(args, model_path, dl_framework="pytorch", device_ids=device_ids)
+
+    video_file = args.video_path
+    cap = cv2.VideoCapture(video_file)
+    frame_width = int(cap.get(3))   # 3 == CAP_PROP_FRAME_WIDTH
+    frame_height = int(cap.get(4))  # 4 == CAP_PROP_FRAME_HEIGHT
+
+    # out_video_file = './output_video.mp4'
+    # fps = 30
+    # size = (frame_width, frame_height)
+    # out = cv2.VideoWriter(out_video_file, cv2.VideoWriter_fourcc(*'mp4v'), fps, size)
+
+    count = 0
+    kpts_code = dict()
+
+    # Precomputed keypoints live next to the video, keyed by "<frame>.png".
+    keypoint_data_path = args.video_path.replace('.mp4','.json')
+    with open(keypoint_data_path,'r') as f:
+        keypoint_data = json.load(f)
+
+    ########### inference ############
+    path = video_file[:-4]
+    while(cap.isOpened()):
+        ret, image = cap.read()
+
+        if ret:
+            detections, _ = detector.detect(image, 0.8, 1)
+            image_draw = copy.deepcopy(image)
+
+            cv2.imwrite(os.path.join(path, 'image', str(count+1)+'.png'), image_draw)
+
+            for idx in range(len(detections)):
+                # detections[idx][2:6] = (x, y, w, h) of the face box.
+                x1_ori = detections[idx][2]
+                y1_ori = detections[idx][3]
+                x2_ori = x1_ori + detections[idx][4]
+                y2_ori = y1_ori + detections[idx][5]
+
+                scale = max(x2_ori - x1_ori, y2_ori - y1_ori) / 180
+                center_w = (x1_ori + x2_ori) / 2
+                center_h = (y1_ori + y2_ori) / 2
+                scale, center_w, center_h = float(scale), float(center_w), float(center_h)
+
+                # Network inference is disabled here; keypoints come from the
+                # precomputed JSON instead.
+                # landmarks_pv = alignment.analyze(image, scale, center_w, center_h)
+                landmarks_pv = np.array(keypoint_data[str(count+1)+'.png'])
+
+                landmarks_pv_list = landmarks_pv.tolist()
+
+                for num in range(landmarks_pv.shape[0]):
+                    cv2.circle(image_draw, (round(landmarks_pv[num][0]), round(landmarks_pv[num][1])),
+                               2, (0, 255, 0), -1)
+                    cv2.putText(image_draw, str(num),
+                                (round(landmarks_pv[num][0]) + 5, round(landmarks_pv[num][1]) + 5),  # text position
+                                cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 1, cv2.LINE_AA)
+
+                kpts_code[str(count+1)+'.png'] = landmarks_pv_list
+                cv2.imwrite(os.path.join(path, 'landmark', str(count+1)+'.png'), image_draw)
+        else:
+            break
+
+        count += 1
+
+    cap.release()
+    # out.release()
+    # cv2.destroyAllWindows()
+
+    path = video_file[:-4]
+    # NOTE(review): file handle from open() is never closed explicitly.
+    json.dump(kpts_code, open(os.path.join(path, 'keypoint.json'), 'w'))
+
+    print(path)
+
diff --git a/LAM_gpro/external/landmark_detection/lib/__init__.py b/LAM_gpro/external/landmark_detection/lib/__init__.py
new file mode 100644
index 0000000..ff08a78
--- /dev/null
+++ b/LAM_gpro/external/landmark_detection/lib/__init__.py
@@ -0,0 +1,9 @@
+from .dataset import get_encoder, get_decoder
+from .dataset import AlignmentDataset, Augmentation
+from .backbone import StackedHGNetV1
+from .metric import NME, Accuracy
+from .utils import time_print, time_string, time_for_file, time_string_short
+from .utils import convert_secs2time, convert_size2str
+
+from .utility import get_dataloader, get_config, get_net, get_criterions
+from .utility import get_optimizer, get_scheduler
diff --git a/LAM_gpro/external/landmark_detection/lib/backbone/__init__.py b/LAM_gpro/external/landmark_detection/lib/backbone/__init__.py
new file mode 100644
index 0000000..b967103
--- /dev/null
+++ b/LAM_gpro/external/landmark_detection/lib/backbone/__init__.py
@@ -0,0 +1,5 @@
+from .stackedHGNetV1 import StackedHGNetV1
+
+__all__ = [
+ "StackedHGNetV1",
+]
\ No newline at end of file
diff --git a/LAM_gpro/external/landmark_detection/lib/backbone/core/coord_conv.py b/LAM_gpro/external/landmark_detection/lib/backbone/core/coord_conv.py
new file mode 100644
index 0000000..0eb8e2d
--- /dev/null
+++ b/LAM_gpro/external/landmark_detection/lib/backbone/core/coord_conv.py
@@ -0,0 +1,157 @@
+import torch
+import torch.nn as nn
+
+
+class AddCoordsTh(nn.Module):
+    """Appends normalized coordinate channels (CoordConv) to the input.
+
+    Optionally adds a radius channel (with_r) and, when with_boundary is set
+    and a heatmap is given, coordinate channels masked by the last heatmap
+    channel (boundary confidence > 0.05).
+    """
+    def __init__(self, x_dim, y_dim, with_r=False, with_boundary=False):
+        super(AddCoordsTh, self).__init__()
+        self.x_dim = x_dim
+        self.y_dim = y_dim
+        self.with_r = with_r
+        self.with_boundary = with_boundary
+
+    def forward(self, input_tensor, heatmap=None):
+        """
+        input_tensor: (batch, c, x_dim, y_dim)
+        Returns the input with 2 coordinate channels appended (plus 1 radius
+        channel if with_r, plus 2 boundary-masked channels if applicable).
+        """
+        batch_size_tensor = input_tensor.shape[0]
+
+        # Build an x-coordinate grid via an outer product of ones and arange.
+        xx_ones = torch.ones([1, self.y_dim], dtype=torch.int32).to(input_tensor)
+        xx_ones = xx_ones.unsqueeze(-1)
+
+        xx_range = torch.arange(self.x_dim, dtype=torch.int32).unsqueeze(0).to(input_tensor)
+        xx_range = xx_range.unsqueeze(1)
+
+        xx_channel = torch.matmul(xx_ones.float(), xx_range.float())
+        xx_channel = xx_channel.unsqueeze(-1)
+
+        # Same for the y-coordinate grid.
+        yy_ones = torch.ones([1, self.x_dim], dtype=torch.int32).to(input_tensor)
+        yy_ones = yy_ones.unsqueeze(1)
+
+        yy_range = torch.arange(self.y_dim, dtype=torch.int32).unsqueeze(0).to(input_tensor)
+        yy_range = yy_range.unsqueeze(-1)
+
+        yy_channel = torch.matmul(yy_range.float(), yy_ones.float())
+        yy_channel = yy_channel.unsqueeze(-1)
+
+        xx_channel = xx_channel.permute(0, 3, 2, 1)
+        yy_channel = yy_channel.permute(0, 3, 2, 1)
+
+        # Normalize coordinates to [-1, 1].
+        xx_channel = xx_channel / (self.x_dim - 1)
+        yy_channel = yy_channel / (self.y_dim - 1)
+
+        xx_channel = xx_channel * 2 - 1
+        yy_channel = yy_channel * 2 - 1
+
+        xx_channel = xx_channel.repeat(batch_size_tensor, 1, 1, 1)
+        yy_channel = yy_channel.repeat(batch_size_tensor, 1, 1, 1)
+
+        if self.with_boundary and type(heatmap) != type(None):
+            # Keep coordinates only where the last heatmap channel is confident.
+            boundary_channel = torch.clamp(heatmap[:, -1:, :, :],
+                                           0.0, 1.0)
+
+            zero_tensor = torch.zeros_like(xx_channel).to(xx_channel)
+            xx_boundary_channel = torch.where(boundary_channel>0.05,
+                                              xx_channel, zero_tensor)
+            yy_boundary_channel = torch.where(boundary_channel>0.05,
+                                              yy_channel, zero_tensor)
+        ret = torch.cat([input_tensor, xx_channel, yy_channel], dim=1)
+
+
+        if self.with_r:
+            # Radius channel normalized by its maximum value.
+            rr = torch.sqrt(torch.pow(xx_channel, 2) + torch.pow(yy_channel, 2))
+            rr = rr / torch.max(rr)
+            ret = torch.cat([ret, rr], dim=1)
+
+        if self.with_boundary and type(heatmap) != type(None):
+            ret = torch.cat([ret, xx_boundary_channel,
+                             yy_boundary_channel], dim=1)
+        return ret
+
+
+class CoordConvTh(nn.Module):
+    """CoordConv layer as in the paper: AddCoordsTh followed by a Conv2d,
+    with optional BatchNorm and ReLU."""
+    def __init__(self, x_dim, y_dim, with_r, with_boundary,
+                 in_channels, out_channels, first_one=False, relu=False, bn=False, *args, **kwargs):
+        super(CoordConvTh, self).__init__()
+        self.addcoords = AddCoordsTh(x_dim=x_dim, y_dim=y_dim, with_r=with_r,
+                                     with_boundary=with_boundary)
+        # Widen the conv input to account for channels AddCoordsTh appends.
+        in_channels += 2
+        if with_r:
+            in_channels += 1
+        if with_boundary and not first_one:
+            in_channels += 2
+        self.conv = nn.Conv2d(in_channels=in_channels, out_channels=out_channels, *args, **kwargs)
+        self.relu = nn.ReLU() if relu else None
+        self.bn = nn.BatchNorm2d(out_channels) if bn else None
+
+        self.with_boundary = with_boundary
+        self.first_one = first_one
+
+
+    def forward(self, input_tensor, heatmap=None):
+        # A boundary heatmap must be provided exactly when this is a
+        # non-first boundary-aware layer.
+        assert (self.with_boundary and not self.first_one) == (heatmap is not None)
+        ret = self.addcoords(input_tensor, heatmap)
+        ret = self.conv(ret)
+        if self.bn is not None:
+            ret = self.bn(ret)
+        if self.relu is not None:
+            ret = self.relu(ret)
+
+        return ret
+
+
+'''
+An alternative implementation for PyTorch with auto-infering the x-y dimensions.
+'''
+class AddCoords(nn.Module):
+    """CoordConv helper that infers the spatial dims from the input itself."""
+
+    def __init__(self, with_r=False):
+        super().__init__()
+        self.with_r = with_r
+
+    def forward(self, input_tensor):
+        """
+        Args:
+            input_tensor: shape(batch, channel, x_dim, y_dim)
+        Returns:
+            input with two normalized coordinate channels appended
+            (plus a radius channel when with_r is True).
+        """
+        batch_size, _, x_dim, y_dim = input_tensor.size()
+
+        xx_channel = torch.arange(x_dim).repeat(1, y_dim, 1).to(input_tensor)
+        yy_channel = torch.arange(y_dim).repeat(1, x_dim, 1).transpose(1, 2).to(input_tensor)
+
+        # Normalize coordinates to [-1, 1].
+        xx_channel = xx_channel / (x_dim - 1)
+        yy_channel = yy_channel / (y_dim - 1)
+
+        xx_channel = xx_channel * 2 - 1
+        yy_channel = yy_channel * 2 - 1
+
+        xx_channel = xx_channel.repeat(batch_size, 1, 1, 1).transpose(2, 3)
+        yy_channel = yy_channel.repeat(batch_size, 1, 1, 1).transpose(2, 3)
+
+        ret = torch.cat([
+            input_tensor,
+            xx_channel.type_as(input_tensor),
+            yy_channel.type_as(input_tensor)], dim=1)
+
+        if self.with_r:
+            # Distance from the 0.5-offset center in normalized coordinates.
+            rr = torch.sqrt(torch.pow(xx_channel - 0.5, 2) + torch.pow(yy_channel - 0.5, 2))
+            ret = torch.cat([ret, rr], dim=1)
+
+        return ret
+
+
+class CoordConv(nn.Module):
+    """Conv2d preceded by AddCoords (the auto-inferred-dims variant)."""
+
+    def __init__(self, in_channels, out_channels, with_r=False, **kwargs):
+        super().__init__()
+        self.addcoords = AddCoords(with_r=with_r)
+        # Extra channels appended by AddCoords.
+        in_channels += 2
+        if with_r:
+            in_channels += 1
+        self.conv = nn.Conv2d(in_channels, out_channels, **kwargs)
+
+    def forward(self, x):
+        ret = self.addcoords(x)
+        ret = self.conv(ret)
+        return ret
diff --git a/LAM_gpro/external/landmark_detection/lib/backbone/stackedHGNetV1.py b/LAM_gpro/external/landmark_detection/lib/backbone/stackedHGNetV1.py
new file mode 100644
index 0000000..f10264d
--- /dev/null
+++ b/LAM_gpro/external/landmark_detection/lib/backbone/stackedHGNetV1.py
@@ -0,0 +1,307 @@
+import numpy as np
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+from .core.coord_conv import CoordConvTh
+from external.landmark_detection.lib.dataset import get_decoder
+
+
+
+class Activation(nn.Module):
+    """Parses a 'norm+act' spec string (e.g. 'in+relu', 'bn+sigmoid', or a
+    bare activation like 'relu') into an optional normalization followed by
+    an optional activation. `channel` is required for the 'bn' variants."""
+    def __init__(self, kind: str = 'relu', channel=None):
+        super().__init__()
+        self.kind = kind
+
+        if '+' in kind:
+            norm_str, act_str = kind.split('+')
+        else:
+            norm_str, act_str = 'none', kind
+
+        # Normalization: functional instance norm, a BatchNorm2d module,
+        # or nothing. An unknown key raises KeyError at construction time.
+        self.norm_fn = {
+            'in': F.instance_norm,
+            'bn': nn.BatchNorm2d(channel),
+            'bn_noaffine': nn.BatchNorm2d(channel, affine=False, track_running_stats=True),
+            'none': None
+        }[norm_str]
+
+        self.act_fn = {
+            'relu': F.relu,
+            'softplus': nn.Softplus(),
+            'exp': torch.exp,
+            'sigmoid': torch.sigmoid,
+            'tanh': torch.tanh,
+            'none': None
+        }[act_str]
+
+        self.channel = channel
+
+    def forward(self, x):
+        # Apply norm then activation, skipping whichever is absent.
+        if self.norm_fn is not None:
+            x = self.norm_fn(x)
+        if self.act_fn is not None:
+            x = self.act_fn(x)
+        return x
+
+    def extra_repr(self):
+        return f'kind={self.kind}, channel={self.channel}'
+
+
+class ConvBlock(nn.Module):
+    """Conv2d with 'same'-style padding plus optional BatchNorm and ReLU."""
+    def __init__(self, inp_dim, out_dim, kernel_size=3, stride=1, bn=False, relu=True, groups=1):
+        super(ConvBlock, self).__init__()
+        self.inp_dim = inp_dim
+        # (kernel_size - 1) // 2 preserves spatial size for odd kernels at stride 1.
+        self.conv = nn.Conv2d(inp_dim, out_dim, kernel_size,
+                              stride, padding=(kernel_size - 1) // 2, groups=groups, bias=True)
+        self.relu = None
+        self.bn = None
+        if relu:
+            self.relu = nn.ReLU()
+        if bn:
+            self.bn = nn.BatchNorm2d(out_dim)
+
+    def forward(self, x):
+        # Order: conv -> bn (if enabled) -> relu (if enabled).
+        x = self.conv(x)
+        if self.bn is not None:
+            x = self.bn(x)
+        if self.relu is not None:
+            x = self.relu(x)
+        return x
+
+
+class ResBlock(nn.Module):
+    """Pre-activation bottleneck residual block: (BN-ReLU-Conv) x3 + skip."""
+    def __init__(self, inp_dim, out_dim, mid_dim=None):
+        super(ResBlock, self).__init__()
+        if mid_dim is None:
+            # Bottleneck width defaults to half the output channels.
+            mid_dim = out_dim // 2
+        self.relu = nn.ReLU()
+        self.bn1 = nn.BatchNorm2d(inp_dim)
+        self.conv1 = ConvBlock(inp_dim, mid_dim, 1, relu=False)
+        self.bn2 = nn.BatchNorm2d(mid_dim)
+        self.conv2 = ConvBlock(mid_dim, mid_dim, 3, relu=False)
+        self.bn3 = nn.BatchNorm2d(mid_dim)
+        self.conv3 = ConvBlock(mid_dim, out_dim, 1, relu=False)
+        # 1x1 projection; only used when channel counts differ.
+        self.skip_layer = ConvBlock(inp_dim, out_dim, 1, relu=False)
+        if inp_dim == out_dim:
+            self.need_skip = False
+        else:
+            self.need_skip = True
+
+    def forward(self, x):
+        if self.need_skip:
+            residual = self.skip_layer(x)
+        else:
+            residual = x
+        out = self.bn1(x)
+        out = self.relu(out)
+        out = self.conv1(out)
+        out = self.bn2(out)
+        out = self.relu(out)
+        out = self.conv2(out)
+        out = self.bn3(out)
+        out = self.relu(out)
+        out = self.conv3(out)
+        out += residual
+        return out
+
+
+class Hourglass(nn.Module):
+    """Recursive hourglass of depth `n` at feature width `f`.
+
+    Optionally starts with a boundary-aware CoordConv (add_coord); only the
+    outermost level uses it -- recursive levels pass add_coord=False.
+    """
+    def __init__(self, n, f, increase=0, up_mode='nearest',
+                 add_coord=False, first_one=False, x_dim=64, y_dim=64):
+        super(Hourglass, self).__init__()
+        nf = f + increase
+
+        Block = ResBlock
+
+        if add_coord:
+            self.coordconv = CoordConvTh(x_dim=x_dim, y_dim=y_dim,
+                                         with_r=True, with_boundary=True,
+                                         relu=False, bn=False,
+                                         in_channels=f, out_channels=f,
+                                         first_one=first_one,
+                                         kernel_size=1,
+                                         stride=1, padding=0)
+        else:
+            self.coordconv = None
+        self.up1 = Block(f, f)
+
+        # Lower branch
+        self.pool1 = nn.MaxPool2d(kernel_size=2, stride=2)
+
+        self.low1 = Block(f, nf)
+        self.n = n
+        # Recursive hourglass
+        if self.n > 1:
+            self.low2 = Hourglass(n=n - 1, f=nf, increase=increase, up_mode=up_mode, add_coord=False)
+        else:
+            self.low2 = Block(nf, nf)
+        self.low3 = Block(nf, f)
+        self.up2 = nn.Upsample(scale_factor=2, mode=up_mode)
+
+    def forward(self, x, heatmap=None):
+        # `heatmap` is only consumed by the optional boundary CoordConv.
+        if self.coordconv is not None:
+            x = self.coordconv(x, heatmap)
+        up1 = self.up1(x)
+        pool1 = self.pool1(x)
+        low1 = self.low1(pool1)
+        low2 = self.low2(low1)
+        low3 = self.low3(low2)
+        up2 = self.up2(low3)
+        # Combine the full-resolution path with the upsampled low-res path.
+        return up1 + up2
+
+
+class E2HTransform(nn.Module):
+    """Maps edge heatmaps to point heatmaps through a fixed edge->point
+    incidence matrix applied as a 1x1 convolution."""
+    def __init__(self, edge_info, num_points, num_edges):
+        super().__init__()
+
+        # e2h_matrix[p, e] = 1 iff point p lies on edge e.
+        e2h_matrix = np.zeros([num_points, num_edges])
+        for edge_id, isclosed_indices in enumerate(edge_info):
+            is_closed, indices = isclosed_indices
+            for point_id in indices:
+                e2h_matrix[point_id, edge_id] = 1
+        e2h_matrix = torch.from_numpy(e2h_matrix).float()
+
+        # pn x en x 1 x 1.
+        self.register_buffer('weight', e2h_matrix.view(
+            e2h_matrix.size(0), e2h_matrix.size(1), 1, 1))
+
+        # Some keypoints are not covered by any edge; those rows get a
+        # constant bias of 1 so their heatmap weight is not zeroed out.
+        bias = ((e2h_matrix @ torch.ones(e2h_matrix.size(1)).to(
+            e2h_matrix)) < 0.5).to(e2h_matrix)
+        # pn x 1.
+        self.register_buffer('bias', bias)
+
+    def forward(self, edgemaps):
+        # input: batch_size x en x hw x hh.
+        # output: batch_size x pn x hw x hh.
+        return F.conv2d(edgemaps, weight=self.weight, bias=self.bias)
+
+
+class StackedHGNetV1(nn.Module):
+    """Stacked hourglass network (V1) with an optional CoordConv stem and
+    optional AAM branches (point maps + edge maps) gated by cfg.use_AAM.
+
+    forward returns (y, fusionmaps, landmarks); y holds per-stack outputs
+    (landmarks, and pointmaps/edgemaps when use_AAM is on).
+    """
+    def __init__(self, config, classes_num, edge_info,
+                 nstack=4, nlevels=4, in_channel=256, increase=0,
+                 add_coord=True, decoder_type='default'):
+        super(StackedHGNetV1, self).__init__()
+
+        self.cfg = config
+        self.coder_type = decoder_type
+        self.decoder = get_decoder(decoder_type=decoder_type)
+        self.nstack = nstack
+        self.add_coord = add_coord
+
+        # classes_num: [num_heatmaps, num_edges, num_points]; the last two
+        # are only read when cfg.use_AAM is set.
+        self.num_heats = classes_num[0]
+
+        if self.add_coord:
+            convBlock = CoordConvTh(x_dim=self.cfg.width, y_dim=self.cfg.height,
+                                    with_r=True, with_boundary=False,
+                                    relu=True, bn=True,
+                                    in_channels=3, out_channels=64,
+                                    kernel_size=7,
+                                    stride=2, padding=3)
+        else:
+            convBlock = ConvBlock(3, 64, 7, 2, bn=True, relu=True)
+
+        pool = nn.MaxPool2d(kernel_size=2, stride=2)
+
+        Block = ResBlock
+
+        # Stem: stride-2 conv + pool -> quarter-resolution features.
+        self.pre = nn.Sequential(
+            convBlock,
+            Block(64, 128),
+            pool,
+            Block(128, 128),
+            Block(128, in_channel)
+        )
+
+        self.hgs = nn.ModuleList(
+            [Hourglass(n=nlevels, f=in_channel, increase=increase, add_coord=self.add_coord, first_one=(_ == 0),
+                       x_dim=int(self.cfg.width / self.nstack), y_dim=int(self.cfg.height / self.nstack))
+             for _ in range(nstack)])
+
+        self.features = nn.ModuleList([
+            nn.Sequential(
+                Block(in_channel, in_channel),
+                ConvBlock(in_channel, in_channel, 1, bn=True, relu=True)
+            ) for _ in range(nstack)])
+
+        self.out_heatmaps = nn.ModuleList(
+            [ConvBlock(in_channel, self.num_heats, 1, relu=False, bn=False)
+             for _ in range(nstack)])
+
+        if self.cfg.use_AAM:
+            self.num_edges = classes_num[1]
+            self.num_points = classes_num[2]
+
+            self.e2h_transform = E2HTransform(edge_info, self.num_points, self.num_edges)
+            self.out_edgemaps = nn.ModuleList(
+                [ConvBlock(in_channel, self.num_edges, 1, relu=False, bn=False)
+                 for _ in range(nstack)])
+            self.out_pointmaps = nn.ModuleList(
+                [ConvBlock(in_channel, self.num_points, 1, relu=False, bn=False)
+                 for _ in range(nstack)])
+            self.merge_edgemaps = nn.ModuleList(
+                [ConvBlock(self.num_edges, in_channel, 1, relu=False, bn=False)
+                 for _ in range(nstack - 1)])
+            self.merge_pointmaps = nn.ModuleList(
+                [ConvBlock(self.num_points, in_channel, 1, relu=False, bn=False)
+                 for _ in range(nstack - 1)])
+            self.edgemap_act = Activation("sigmoid", self.num_edges)
+            self.pointmap_act = Activation("sigmoid", self.num_points)
+
+        self.merge_features = nn.ModuleList(
+            [ConvBlock(in_channel, in_channel, 1, relu=False, bn=False)
+             for _ in range(nstack - 1)])
+        self.merge_heatmaps = nn.ModuleList(
+            [ConvBlock(self.num_heats, in_channel, 1, relu=False, bn=False)
+             for _ in range(nstack - 1)])
+
+        self.nstack = nstack
+
+        self.heatmap_act = Activation("in+relu", self.num_heats)
+
+        self.inference = False
+
+    def set_inference(self, inference):
+        # Toggle the inference flag (not read inside this forward pass).
+        self.inference = inference
+
+    def forward(self, x):
+        x = self.pre(x)
+
+        y, fusionmaps = [], []
+        heatmaps = None
+        for i in range(self.nstack):
+            # First stack gets no heatmap; later stacks see the previous one.
+            hg = self.hgs[i](x, heatmap=heatmaps)
+            feature = self.features[i](hg)
+
+            heatmaps0 = self.out_heatmaps[i](feature)
+            heatmaps = self.heatmap_act(heatmaps0)
+
+            if self.cfg.use_AAM:
+                pointmaps0 = self.out_pointmaps[i](feature)
+                pointmaps = self.pointmap_act(pointmaps0)
+                edgemaps0 = self.out_edgemaps[i](feature)
+                edgemaps = self.edgemap_act(edgemaps0)
+                # Gate heatmaps by edge-derived point support.
+                mask = self.e2h_transform(edgemaps) * pointmaps
+                fusion_heatmaps = mask * heatmaps
+            else:
+                fusion_heatmaps = heatmaps
+
+            landmarks = self.decoder.get_coords_from_heatmap(fusion_heatmaps)
+
+            if i < self.nstack - 1:
+                # Intermediate supervision: fold predictions back into features.
+                x = x + self.merge_features[i](feature) + \
+                    self.merge_heatmaps[i](heatmaps)
+                if self.cfg.use_AAM:
+                    x += self.merge_pointmaps[i](pointmaps)
+                    x += self.merge_edgemaps[i](edgemaps)
+
+            y.append(landmarks)
+            if self.cfg.use_AAM:
+                y.append(pointmaps)
+                y.append(edgemaps)
+
+            fusionmaps.append(fusion_heatmaps)
+
+        return y, fusionmaps, landmarks
\ No newline at end of file
diff --git a/LAM_gpro/external/landmark_detection/lib/dataset/__init__.py b/LAM_gpro/external/landmark_detection/lib/dataset/__init__.py
new file mode 100644
index 0000000..3380c4b
--- /dev/null
+++ b/LAM_gpro/external/landmark_detection/lib/dataset/__init__.py
@@ -0,0 +1,11 @@
+from .encoder import get_encoder
+from .decoder import get_decoder
+from .augmentation import Augmentation
+from .alignmentDataset import AlignmentDataset
+
+__all__ = [
+ "Augmentation",
+ "AlignmentDataset",
+ "get_encoder",
+ "get_decoder"
+]
diff --git a/LAM_gpro/external/landmark_detection/lib/dataset/alignmentDataset.py b/LAM_gpro/external/landmark_detection/lib/dataset/alignmentDataset.py
new file mode 100644
index 0000000..236777e
--- /dev/null
+++ b/LAM_gpro/external/landmark_detection/lib/dataset/alignmentDataset.py
@@ -0,0 +1,316 @@
+import os
+import sys
+import cv2
+import math
+import copy
+import hashlib
+import imageio
+import numpy as np
+import pandas as pd
+from scipy import interpolate
+from PIL import Image, ImageEnhance, ImageFile
+
+import torch
+import torch.nn.functional as F
+from torch.utils.data import Dataset
+
+ImageFile.LOAD_TRUNCATED_IMAGES = True
+
+sys.path.append("./")
+from external.landmark_detection.lib.dataset.augmentation import Augmentation
+from external.landmark_detection.lib.dataset.encoder import get_encoder
+
+
+class AlignmentDataset(Dataset):
+    """TSV-backed face-alignment dataset.
+
+    Each TSV row holds: image path, 5-point landmarks, target landmarks,
+    scale, center_w, center_h, and optional comma-separated integer tags.
+    __getitem__ returns a dict with the normalized image tensor, normalized
+    landmarks, and (since use_AAM is hard-coded True) point/edge maps.
+    """
+
+    def __init__(self, tsv_flie, image_dir="", transform=None,
+                 width=256, height=256, channels=3,
+                 means=(127.5, 127.5, 127.5), scale=1 / 127.5,
+                 classes_num=None, crop_op=True, aug_prob=0.0, edge_info=None, flip_mapping=None, is_train=True,
+                 encoder_type='default',
+                 ):
+        # NOTE(review): "tsv_flie" is a typo for "tsv_file"; kept because
+        # callers may pass it by keyword.
+        super(AlignmentDataset, self).__init__()
+        self.use_AAM = True  # always emit point/edge maps in __getitem__
+        self.encoder_type = encoder_type
+        self.encoder = get_encoder(height, width, encoder_type=encoder_type)
+        self.items = pd.read_csv(tsv_flie, sep="\t")
+        self.image_dir = image_dir
+        self.landmark_num = classes_num[0]
+        self.transform = transform
+
+        self.image_width = width
+        self.image_height = height
+        self.channels = channels
+        # Downstream code assumes square images.
+        assert self.image_width == self.image_height
+
+        self.means = means
+        self.scale = scale
+
+        self.aug_prob = aug_prob
+        self.edge_info = edge_info
+        self.is_train = is_train
+        # Canonical 5-point layout (eyes, nose tip, mouth corners),
+        # normalized from a 512-px reference into [-1, 1].
+        std_lmk_5pts = np.array([
+            196.0, 226.0,
+            316.0, 226.0,
+            256.0, 286.0,
+            220.0, 360.4,
+            292.0, 360.4], np.float32) / 256.0 - 1.0
+        std_lmk_5pts = np.reshape(std_lmk_5pts, (5, 2))  # [-1 1]
+        target_face_scale = 1.0 if crop_op else 1.25
+
+        self.augmentation = Augmentation(
+            is_train=self.is_train,
+            aug_prob=self.aug_prob,
+            image_size=self.image_width,
+            crop_op=crop_op,
+            std_lmk_5pts=std_lmk_5pts,
+            target_face_scale=target_face_scale,
+            flip_rate=0.5,
+            flip_mapping=flip_mapping,
+            random_shift_sigma=0.05,
+            random_rot_sigma=math.pi / 180 * 18,
+            random_scale_sigma=0.1,
+            random_gray_rate=0.2,
+            random_occ_rate=0.4,
+            random_blur_rate=0.3,
+            random_gamma_rate=0.2,
+            random_nose_fusion_rate=0.2)
+
+    def _circle(self, img, pt, sigma=1.0, label_type='Gaussian'):
+        # Draw a small Gaussian (or Cauchy-style) blob centered at `pt`.
+        # Check that any part of the gaussian is in-bounds
+        tmp_size = sigma * 3
+        ul = [int(pt[0] - tmp_size), int(pt[1] - tmp_size)]
+        br = [int(pt[0] + tmp_size + 1), int(pt[1] + tmp_size + 1)]
+        if (ul[0] > img.shape[1] - 1 or ul[1] > img.shape[0] - 1 or
+                br[0] - 1 < 0 or br[1] - 1 < 0):
+            # If not, just return the image as is
+            return img
+
+        # Generate gaussian
+        size = 2 * tmp_size + 1
+        x = np.arange(0, size, 1, np.float32)
+        y = x[:, np.newaxis]
+        x0 = y0 = size // 2
+        # The gaussian is not normalized, we want the center value to equal 1
+        if label_type == 'Gaussian':
+            g = np.exp(- ((x - x0) ** 2 + (y - y0) ** 2) / (2 * sigma ** 2))
+        else:
+            g = sigma / (((x - x0) ** 2 + (y - y0) ** 2 + sigma ** 2) ** 1.5)
+
+        # Usable gaussian range
+        g_x = max(0, -ul[0]), min(br[0], img.shape[1]) - ul[0]
+        g_y = max(0, -ul[1]), min(br[1], img.shape[0]) - ul[1]
+        # Image range
+        img_x = max(0, ul[0]), min(br[0], img.shape[1])
+        img_y = max(0, ul[1]), min(br[1], img.shape[0])
+
+        img[img_y[0]:img_y[1], img_x[0]:img_x[1]] = 255 * g[g_y[0]:g_y[1], g_x[0]:g_x[1]]
+        return img
+
+    def _polylines(self, img, lmks, is_closed, color=255, thickness=1, draw_mode=cv2.LINE_AA,
+                   interpolate_mode=cv2.INTER_AREA, scale=4):
+        # Draw an anti-aliased polyline at `scale`x resolution, then resize
+        # back down for a smoother result.
+        h, w = img.shape
+        img_scale = cv2.resize(img, (w * scale, h * scale), interpolation=interpolate_mode)
+        lmks_scale = (lmks * scale + 0.5).astype(np.int32)
+        cv2.polylines(img_scale, [lmks_scale], is_closed, color, thickness * scale, draw_mode)
+        img = cv2.resize(img_scale, (w, h), interpolation=interpolate_mode)
+        return img
+
+    def _generate_edgemap(self, points, scale=0.25, thickness=1):
+        # Rasterize each edge (from self.edge_info) as a curve map, then
+        # downsample the stack by `scale`. Returns a float tensor in [0, 1].
+        h, w = self.image_height, self.image_width
+        edgemaps = []
+        for is_closed, indices in self.edge_info:
+            edgemap = np.zeros([h, w], dtype=np.float32)
+            # align_corners: False.
+            part = copy.deepcopy(points[np.array(indices)])
+
+            part = self._fit_curve(part, is_closed)
+            part[:, 0] = np.clip(part[:, 0], 0, w - 1)
+            part[:, 1] = np.clip(part[:, 1], 0, h - 1)
+            edgemap = self._polylines(edgemap, part, is_closed, 255, thickness)
+
+            edgemaps.append(edgemap)
+        edgemaps = np.stack(edgemaps, axis=0) / 255.0
+        edgemaps = torch.from_numpy(edgemaps).float().unsqueeze(0)
+        edgemaps = F.interpolate(edgemaps, size=(int(w * scale), int(h * scale)), mode='bilinear',
+                                 align_corners=False).squeeze()
+        return edgemaps
+
+    def _fit_curve(self, lmks, is_closed=False, density=5):
+        # Fit a cubic B-spline through the landmarks and resample it at
+        # `density` points per segment. Falls back to the raw landmarks on
+        # any spline-fitting failure (bare except is deliberate best-effort).
+        try:
+            x = lmks[:, 0].copy()
+            y = lmks[:, 1].copy()
+            if is_closed:
+                x = np.append(x, x[0])
+                y = np.append(y, y[0])
+            tck, u = interpolate.splprep([x, y], s=0, per=is_closed, k=3)
+            # bins = (x.shape[0] - 1) * density + 1
+            # lmk_x, lmk_y = interpolate.splev(np.linspace(0, 1, bins), f)
+            intervals = np.array([])
+            for i in range(len(u) - 1):
+                intervals = np.concatenate((intervals, np.linspace(u[i], u[i + 1], density, endpoint=False)))
+            if not is_closed:
+                intervals = np.concatenate((intervals, [u[-1]]))
+            lmk_x, lmk_y = interpolate.splev(intervals, tck, der=0)
+            # der_x, der_y = interpolate.splev(intervals, tck, der=1)
+            curve_lmks = np.stack([lmk_x, lmk_y], axis=-1)
+            # curve_ders = np.stack([der_x, der_y], axis=-1)
+            # origin_indices = np.arange(0, curve_lmks.shape[0], density)
+
+            return curve_lmks
+        except:
+            return lmks
+
+    def _image_id(self, image_path):
+        # MD5 of the raw file bytes; resolves relative paths via image_dir.
+        if not os.path.exists(image_path):
+            image_path = os.path.join(self.image_dir, image_path)
+        return hashlib.md5(open(image_path, "rb").read()).hexdigest()
+
+    def _load_image(self, image_path):
+        # Load an image as HWC/BGR uint8, trying cv2 first, then imageio,
+        # then the first frame of a GIF; returns None if everything fails.
+        if not os.path.exists(image_path):
+            image_path = os.path.join(self.image_dir, image_path)
+
+        try:
+            # img = cv2.imdecode(np.fromfile(image_path, dtype=np.uint8), cv2.IMREAD_COLOR)#HWC, BGR, [0-255]
+            img = cv2.imread(image_path, cv2.IMREAD_COLOR)  # HWC, BGR, [0-255]
+            assert img is not None and len(img.shape) == 3 and img.shape[2] == 3
+        except:
+            try:
+                img = imageio.imread(image_path)  # HWC, RGB, [0-255]
+                img = cv2.cvtColor(img, cv2.COLOR_RGB2BGR)  # HWC, BGR, [0-255]
+                assert img is not None and len(img.shape) == 3 and img.shape[2] == 3
+            except:
+                try:
+                    gifImg = imageio.mimread(image_path)  # BHWC, RGB, [0-255]
+                    img = gifImg[0]  # HWC, RGB, [0-255]
+                    img = cv2.cvtColor(img, cv2.COLOR_RGB2BGR)  # HWC, BGR, [0-255]
+                    assert img is not None and len(img.shape) == 3 and img.shape[2] == 3
+                except:
+                    img = None
+        return img
+
+    def _compose_rotate_and_scale(self, angle, scale, shift_xy, from_center, to_center):
+        # Compose rotation + uniform scale + translation into one 3x3
+        # affine matrix mapping from_center onto to_center.
+        cosv = math.cos(angle)
+        sinv = math.sin(angle)
+
+        fx, fy = from_center
+        tx, ty = to_center
+
+        acos = scale * cosv
+        asin = scale * sinv
+
+        a0 = acos
+        a1 = -asin
+        a2 = tx - acos * fx + asin * fy + shift_xy[0]
+
+        b0 = asin
+        b1 = acos
+        b2 = ty - asin * fx - acos * fy + shift_xy[1]
+
+        rot_scale_m = np.array([
+            [a0, a1, a2],
+            [b0, b1, b2],
+            [0.0, 0.0, 1.0]
+        ], np.float32)
+        return rot_scale_m
+
+    def _transformPoints2D(self, points, matrix):
+        """
+        points (nx2), matrix (3x3) -> points (nx2)
+        Projective transform with perspective divide; preserves dtype.
+        """
+        dtype = points.dtype
+
+        # nx3
+        points = np.concatenate([points, np.ones_like(points[:, [0]])], axis=1)
+        points = points @ np.transpose(matrix)  # nx3
+        points = points[:, :2] / points[:, [2, 2]]
+        return points.astype(dtype)
+
+    def _transformPerspective(self, image, matrix, target_shape):
+        """
+        image, matrix3x3 -> transformed_image
+        target_shape is (height, width); OpenCV dsize wants (width, height).
+        """
+        return cv2.warpPerspective(
+            image, matrix,
+            dsize=(target_shape[1], target_shape[0]),
+            flags=cv2.INTER_LINEAR, borderValue=0)
+
+    def _norm_points(self, points, h, w, align_corners=False):
+        # Pixel coords -> [-1, +1], clamped to the valid range.
+        if align_corners:
+            # [0, SIZE-1] -> [-1, +1]
+            des_points = points / torch.tensor([w - 1, h - 1]).to(points).view(1, 2) * 2 - 1
+        else:
+            # [-0.5, SIZE-0.5] -> [-1, +1]
+            des_points = (points * 2 + 1) / torch.tensor([w, h]).to(points).view(1, 2) - 1
+        des_points = torch.clamp(des_points, -1, 1)
+        return des_points
+
+    def _denorm_points(self, points, h, w, align_corners=False):
+        # Inverse of _norm_points (no clamping).
+        if align_corners:
+            # [-1, +1] -> [0, SIZE-1]
+            des_points = (points + 1) / 2 * torch.tensor([w - 1, h - 1]).to(points).view(1, 1, 2)
+        else:
+            # [-1, +1] -> [-0.5, SIZE-0.5]
+            des_points = ((points + 1) * torch.tensor([w, h]).to(points).view(1, 1, 2) - 1) / 2
+        return des_points
+
+    def __len__(self):
+        return len(self.items)
+
+    def __getitem__(self, index):
+        # Returns a dict: image_path, data (CHW float tensor), label
+        # ([landmarks, pointmap, edgemap]), matrix, tags.
+        sample = dict()
+
+        image_path = self.items.iloc[index, 0]
+        landmarks_5pts = self.items.iloc[index, 1]
+        landmarks_5pts = np.array(list(map(float, landmarks_5pts.split(","))), dtype=np.float32).reshape(5, 2)
+        landmarks_target = self.items.iloc[index, 2]
+        landmarks_target = np.array(list(map(float, landmarks_target.split(","))), dtype=np.float32).reshape(
+            self.landmark_num, 2)
+        scale = float(self.items.iloc[index, 3])
+        center_w, center_h = float(self.items.iloc[index, 4]), float(self.items.iloc[index, 5])
+        if len(self.items.iloc[index]) > 6:
+            tags = np.array(list(map(lambda x: int(float(x)), self.items.iloc[index, 6].split(","))))
+        else:
+            tags = np.array([])
+
+        # image & keypoints alignment
+        image_path = image_path.replace('\\', '/')
+        # wflw testset: strip the legacy network-share prefix.
+        image_path = image_path.replace(
+            '//msr-facestore/Workspace/MSRA_EP_Allergan/users/yanghuan/training_data/wflw/rawImages/', '')
+        # trainset
+        image_path = image_path.replace('./rawImages/', '')
+        image_path = os.path.join(self.image_dir, image_path)
+
+        # image path
+        sample["image_path"] = image_path
+
+        img = self._load_image(image_path)  # HWC, BGR, [0, 255]
+        assert img is not None
+
+        # augmentation
+        # landmarks_target = [-0.5, edge-0.5]
+        img, landmarks_target, matrix = \
+            self.augmentation.process(img, landmarks_target, landmarks_5pts, scale, center_w, center_h)
+
+        landmarks = self._norm_points(torch.from_numpy(landmarks_target), self.image_height, self.image_width)
+
+        sample["label"] = [landmarks, ]
+
+        if self.use_AAM:
+            pointmap = self.encoder.generate_heatmap(landmarks_target)
+            edgemap = self._generate_edgemap(landmarks_target)
+            sample["label"] += [pointmap, edgemap]
+
+        sample['matrix'] = matrix
+
+        # image normalization
+        img = img.transpose(2, 0, 1).astype(np.float32)  # CHW, BGR, [0, 255]
+        img[0, :, :] = (img[0, :, :] - self.means[0]) * self.scale
+        img[1, :, :] = (img[1, :, :] - self.means[1]) * self.scale
+        img[2, :, :] = (img[2, :, :] - self.means[2]) * self.scale
+        sample["data"] = torch.from_numpy(img)  # CHW, BGR, [-1, 1]
+
+        sample["tags"] = tags
+
+        return sample
diff --git a/LAM_gpro/external/landmark_detection/lib/dataset/augmentation.py b/LAM_gpro/external/landmark_detection/lib/dataset/augmentation.py
new file mode 100644
index 0000000..0694d31
--- /dev/null
+++ b/LAM_gpro/external/landmark_detection/lib/dataset/augmentation.py
@@ -0,0 +1,355 @@
+import os
+import cv2
+import math
+import random
+import numpy as np
+from skimage import transform
+
+
+class Augmentation:  # face augmentation pipeline: flip -> crop/align matrix -> random geometry -> texture noise
+    def __init__(self,
+                 is_train=True,
+                 aug_prob=1.0,
+                 image_size=256,
+                 crop_op=True,
+                 std_lmk_5pts=None,
+                 target_face_scale=1.0,
+                 flip_rate=0.5,
+                 flip_mapping=None,
+                 random_shift_sigma=0.05,
+                 random_rot_sigma=math.pi/180*18,
+                 random_scale_sigma=0.1,
+                 random_gray_rate=0.2,
+                 random_occ_rate=0.4,
+                 random_blur_rate=0.3,
+                 random_gamma_rate=0.2,
+                 random_nose_fusion_rate=0.2):
+        self.is_train = is_train
+        self.aug_prob = aug_prob  # probability of applying geometry+texture augmentation per sample
+        self.crop_op = crop_op  # True: crop from (scale, center); False: align via 5-point landmarks
+        self._flip = Flip(flip_mapping, flip_rate)
+        if self.crop_op:
+            self._cropMatrix = GetCropMatrix(
+                image_size=image_size,
+                target_face_scale=target_face_scale,
+                align_corners=True)
+        else:
+            self._alignMatrix = GetAlignMatrix(
+                image_size=image_size,
+                target_face_scale=target_face_scale,
+                std_lmk_5pts=std_lmk_5pts)
+        self._randomGeometryMatrix = GetRandomGeometryMatrix(
+            target_shape=(image_size, image_size),
+            from_shape=(image_size, image_size),
+            shift_sigma=random_shift_sigma,
+            rot_sigma=random_rot_sigma,
+            scale_sigma=random_scale_sigma,
+            align_corners=True)
+        self._transform = Transform(image_size=image_size)
+        self._randomTexture = RandomTexture(
+            random_gray_rate=random_gray_rate,
+            random_occ_rate=random_occ_rate,
+            random_blur_rate=random_blur_rate,
+            random_gamma_rate=random_gamma_rate,
+            random_nose_fusion_rate=random_nose_fusion_rate)
+
+    def process(self, img, lmk, lmk_5pts=None, scale=1.0, center_w=0, center_h=0, is_train=True):  # NOTE(review): 'is_train' arg is unused; self.is_train decides -- confirm intended
+        if self.is_train and random.random() < self.aug_prob:
+            img, lmk, lmk_5pts, center_w, center_h = self._flip.process(img, lmk, lmk_5pts, center_w, center_h)
+            matrix_geoaug = self._randomGeometryMatrix.process()
+            if self.crop_op:
+                matrix_pre = self._cropMatrix.process(scale, center_w, center_h)
+            else:
+                matrix_pre = self._alignMatrix.process(lmk_5pts)
+            matrix = matrix_geoaug @ matrix_pre  # compose so one warp applies crop/align + random geometry
+            aug_img, aug_lmk = self._transform.process(img, lmk, matrix)
+            aug_img = self._randomTexture.process(aug_img)
+        else:
+            if self.crop_op:
+                matrix = self._cropMatrix.process(scale, center_w, center_h)
+            else:
+                matrix = self._alignMatrix.process(lmk_5pts)
+            aug_img, aug_lmk = self._transform.process(img, lmk, matrix)
+        return aug_img, aug_lmk, matrix  # warped image, warped landmarks, 3x3 matrix used
+
+
+class GetCropMatrix:  # builds the 3x3 similarity matrix that crops a face box to the network input
+    def __init__(self, image_size, target_face_scale, align_corners=False):
+        self.image_size = image_size
+        self.target_face_scale = target_face_scale
+        self.align_corners = align_corners
+
+    def _compose_rotate_and_scale(self, angle, scale, shift_xy, from_center, to_center):  # similarity transform mapping from_center -> to_center with rotation/scale/shift
+        cosv = math.cos(angle)
+        sinv = math.sin(angle)
+
+        fx, fy = from_center
+        tx, ty = to_center
+
+        acos = scale * cosv
+        asin = scale * sinv
+
+        a0 = acos
+        a1 = -asin
+        a2 = tx - acos * fx + asin * fy + shift_xy[0]
+
+        b0 = asin
+        b1 = acos
+        b2 = ty - asin * fx - acos * fy + shift_xy[1]
+
+        rot_scale_m = np.array([
+            [a0, a1, a2],
+            [b0, b1, b2],
+            [0.0, 0.0, 1.0]
+        ], np.float32)
+        return rot_scale_m
+
+    def process(self, scale, center_w, center_h):
+        if self.align_corners:
+            to_w, to_h = self.image_size-1, self.image_size-1
+        else:
+            to_w, to_h = self.image_size, self.image_size
+
+        rot_mu = 0
+        scale_mu = self.image_size / (scale * self.target_face_scale * 200.0)  # 200.0: dataset face-box scale convention (assumed) -- TODO confirm
+        shift_xy_mu = (0, 0)
+        matrix = self._compose_rotate_and_scale(
+            rot_mu, scale_mu, shift_xy_mu,
+            from_center=[center_w, center_h],
+            to_center=[to_w/2.0, to_h/2.0])
+        return matrix  # 3x3 np.float32 matrix
+
+
+class GetAlignMatrix:
+    def __init__(self, image_size, target_face_scale, std_lmk_5pts):
+        """
+        Precompute the pixel-space 5-point template; std_lmk_5pts coordinates range over [-1, 1].
+        """
+        self.std_lmk_5pts = (std_lmk_5pts * target_face_scale + 1) * \
+            np.array([image_size, image_size], np.float32) / 2.0  # map template from [-1, 1] to pixel coords
+
+    def process(self, lmk_5pts):
+        assert lmk_5pts.shape[-2:] == (5, 2)
+        tform = transform.SimilarityTransform()
+        tform.estimate(lmk_5pts, self.std_lmk_5pts)  # least-squares similarity fit (skimage)
+        return tform.params  # 3x3 homogeneous matrix
+
+
+class GetRandomGeometryMatrix:  # random shift/rotation/scale matrix for geometry augmentation
+    def __init__(self, target_shape, from_shape,
+                 shift_sigma=0.1, rot_sigma=18*math.pi/180, scale_sigma=0.1,
+                 shift_mu=0.0, rot_mu=0.0, scale_mu=1.0,
+                 shift_normal=True, rot_normal=True, scale_normal=True,
+                 align_corners=False):
+        self.target_shape = target_shape
+        self.from_shape = from_shape
+        self.shift_config = (shift_mu, shift_sigma, shift_normal)  # each config: (mu, sigma, use-normal-distribution)
+        self.rot_config = (rot_mu, rot_sigma, rot_normal)
+        self.scale_config = (scale_mu, scale_sigma, scale_normal)
+        self.align_corners = align_corners
+
+    def _compose_rotate_and_scale(self, angle, scale, shift_xy, from_center, to_center):  # similarity transform mapping from_center -> to_center with rotation/scale/shift
+        cosv = math.cos(angle)
+        sinv = math.sin(angle)
+
+        fx, fy = from_center
+        tx, ty = to_center
+
+        acos = scale * cosv
+        asin = scale * sinv
+
+        a0 = acos
+        a1 = -asin
+        a2 = tx - acos * fx + asin * fy + shift_xy[0]
+
+        b0 = asin
+        b1 = acos
+        b2 = ty - asin * fx - acos * fy + shift_xy[1]
+
+        rot_scale_m = np.array([
+            [a0, a1, a2],
+            [b0, b1, b2],
+            [0.0, 0.0, 1.0]
+        ], np.float32)
+        return rot_scale_m
+
+    def _random(self, mu_sigma_normal, size=None):
+        mu, sigma, is_normal = mu_sigma_normal
+        if is_normal:
+            return np.random.normal(mu, sigma, size=size)
+        else:
+            return np.random.uniform(low=mu-sigma, high=mu+sigma, size=size)
+
+    def process(self):
+        if self.align_corners:
+            from_w, from_h = self.from_shape[1]-1, self.from_shape[0]-1
+            to_w, to_h = self.target_shape[1]-1, self.target_shape[0]-1
+        else:
+            from_w, from_h = self.from_shape[1], self.from_shape[0]
+            to_w, to_h = self.target_shape[1], self.target_shape[0]
+        matrix_geoaug = np.eye(3, dtype=np.float32)  # bugfix: identity fallback; was UnboundLocalError when all configs are at identity values
+        if self.shift_config[:2] != (0.0, 0.0) or \
+            self.rot_config[:2] != (0.0, 0.0) or \
+            self.scale_config[:2] != (1.0, 0.0):
+            shift_xy = self._random(self.shift_config, size=[2]) * \
+                min(to_h, to_w)
+            rot_angle = self._random(self.rot_config)
+            scale = self._random(self.scale_config)
+            matrix_geoaug = self._compose_rotate_and_scale(
+                rot_angle, scale, shift_xy,
+                from_center=[from_w/2.0, from_h/2.0],
+                to_center=[to_w/2.0, to_h/2.0])
+
+        return matrix_geoaug
+
+
+class Transform:
+    def __init__(self, image_size):
+        self.image_size = image_size
+
+    def _transformPoints2D(self, points, matrix):
+        """
+        Apply a 3x3 matrix to points (nx2) -> points (nx2), preserving the input dtype.
+        """
+        dtype = points.dtype
+
+        # nx3 homogeneous coordinates
+        points = np.concatenate([points, np.ones_like(points[:, [0]])], axis=1)
+        points = points @ np.transpose(matrix)
+        points = points[:, :2] / points[:, [2, 2]]  # perspective divide
+        return points.astype(dtype)
+
+    def _transformPerspective(self, image, matrix):
+        """
+        Warp image by matrix (3x3) into a square image_size x image_size output.
+        """
+        return cv2.warpPerspective(
+            image, matrix,
+            dsize=(self.image_size, self.image_size),
+            flags=cv2.INTER_LINEAR, borderValue=0)
+
+    def process(self, image, landmarks, matrix):
+        t_landmarks = self._transformPoints2D(landmarks, matrix)
+        t_image = self._transformPerspective(image, matrix)
+        return t_image, t_landmarks
+
+
+class RandomTexture:
+    def __init__(self, random_gray_rate=0, random_occ_rate=0, random_blur_rate=0, random_gamma_rate=0, random_nose_fusion_rate=0):
+        self.random_gray_rate = random_gray_rate
+        self.random_occ_rate = random_occ_rate
+        self.random_blur_rate = random_blur_rate
+        self.random_gamma_rate = random_gamma_rate
+        self.random_nose_fusion_rate = random_nose_fusion_rate
+        self.texture_augs = (
+            (self.add_occ, self.random_occ_rate),
+            (self.add_blur, self.random_blur_rate),
+            (self.add_gamma, self.random_gamma_rate),
+            (self.add_nose_fusion, self.random_nose_fusion_rate)
+        )
+
+    def add_gray(self, image):
+        assert image.ndim == 3 and image.shape[-1] == 3
+        image = cv2.cvtColor(image, cv2.COLOR_RGB2GRAY)  # NOTE(review): images elsewhere look BGR; RGB2GRAY only changes channel weights -- confirm intended
+        image = np.tile(np.expand_dims(image, -1), [1, 1, 3])
+        return image
+
+    def add_occ(self, image):
+        h, w, c = image.shape
+        rh = 0.2 + 0.6 * random.random()  # [0.2, 0.8] fraction of height
+        rw = rh - 0.2 + 0.4 * random.random()
+        cx = int((w - 1) * random.random())  # bugfix: occluder center x sampled over width (was height)
+        cy = int((h - 1) * random.random())  # bugfix: occluder center y sampled over height (was width)
+        dh = int(h / 2 * rh)
+        dw = int(w / 2 * rw)
+        x0 = max(0, cx - dw // 2)
+        y0 = max(0, cy - dh // 2)
+        x1 = min(w - 1, cx + dw // 2)
+        y1 = min(h - 1, cy + dh // 2)
+        image[y0:y1+1, x0:x1+1] = 0
+        return image
+
+    def add_blur(self, image):
+        blur_kratio = 0.05 * random.random()
+        blur_ksize = int((image.shape[0] + image.shape[1]) / 2 * blur_kratio)
+        if blur_ksize > 1:
+            image = cv2.blur(image, (blur_ksize, blur_ksize))
+        return image
+
+    def add_gamma(self, image):
+        if random.random() < 0.5:
+            gamma = 0.25 + 0.75 * random.random()  # gamma < 1: brighten
+        else:
+            gamma = 1.0 + 3.0 * random.random()  # gamma > 1: darken
+        image = (((image / 255.0) ** gamma) * 255).astype("uint8")
+        return image
+
+    def add_nose_fusion(self, image):
+        h, w, c = image.shape
+        nose = np.array(bytearray(os.urandom(h * w * c)), dtype=image.dtype).reshape(h, w, c)  # uniform byte noise
+        alpha = 0.5 * random.random()
+        image = (1 - alpha) * image + alpha * nose
+        return image.astype(np.uint8)
+
+    def process(self, image):
+        image = image.copy()
+        if random.random() < self.random_occ_rate:
+            image = self.add_occ(image)
+        if random.random() < self.random_blur_rate:
+            image = self.add_blur(image)
+        if random.random() < self.random_gamma_rate:
+            image = self.add_gamma(image)
+        if random.random() < self.random_nose_fusion_rate:
+            image = self.add_nose_fusion(image)
+        """
+        orders = list(range(len(self.texture_augs)))
+        random.shuffle(orders)
+        for order in orders:
+            if random.random() < self.texture_augs[order][1]:
+                image = self.texture_augs[order][0](image)
+        """
+
+        if random.random() < self.random_gray_rate:
+            image = self.add_gray(image)
+
+        return image
+
+
+class Flip:  # horizontal flip of image + landmarks with semantic left/right index swapping
+    def __init__(self, flip_mapping, random_rate):
+        self.flip_mapping = flip_mapping  # list of (i, j) landmark index pairs to swap
+        self.random_rate = random_rate
+
+    def process(self, image, landmarks, landmarks_5pts, center_w, center_h):
+        if random.random() >= self.random_rate or self.flip_mapping is None:
+            return image, landmarks, landmarks_5pts, center_w, center_h
+
+        # COFW
+        if landmarks.shape[0] == 29:
+            flip_offset = 0
+        # 300W, WFLW
+        elif landmarks.shape[0] in (68, 98):
+            flip_offset = -1
+        else:
+            flip_offset = -1
+
+        h, w, _ = image.shape
+        #image_flip = cv2.flip(image, 1)
+        image_flip = np.fliplr(image).copy()
+        landmarks_flip = landmarks.copy()
+        for i, j in self.flip_mapping:
+            landmarks_flip[i] = landmarks[j]
+            landmarks_flip[j] = landmarks[i]
+        landmarks_flip[:, 0] = w + flip_offset - landmarks_flip[:, 0]  # mirror x coordinate
+        if landmarks_5pts is not None:
+            flip_mapping = ([0, 1], [3, 4])  # swaps pairs (0,1) and (3,4); point 2 stays fixed -- presumably eyes/mouth corners with nose central
+            landmarks_5pts_flip = landmarks_5pts.copy()
+            for i, j in flip_mapping:
+                landmarks_5pts_flip[i] = landmarks_5pts[j]
+                landmarks_5pts_flip[j] = landmarks_5pts[i]
+            landmarks_5pts_flip[:, 0] = w + flip_offset - landmarks_5pts_flip[:, 0]
+        else:
+            landmarks_5pts_flip = None
+
+        center_w = w + flip_offset - center_w
+        return image_flip, landmarks_flip, landmarks_5pts_flip, center_w, center_h
diff --git a/LAM_gpro/external/landmark_detection/lib/dataset/decoder/__init__.py b/LAM_gpro/external/landmark_detection/lib/dataset/decoder/__init__.py
new file mode 100644
index 0000000..2315040
--- /dev/null
+++ b/LAM_gpro/external/landmark_detection/lib/dataset/decoder/__init__.py
@@ -0,0 +1,8 @@
+from .decoder_default import decoder_default
+
+def get_decoder(decoder_type='default'):  # factory; only the soft-argmax 'default' decoder is implemented
+    if decoder_type == 'default':
+        decoder = decoder_default()
+    else:
+        raise NotImplementedError
+    return decoder
\ No newline at end of file
diff --git a/LAM_gpro/external/landmark_detection/lib/dataset/decoder/decoder_default.py b/LAM_gpro/external/landmark_detection/lib/dataset/decoder/decoder_default.py
new file mode 100644
index 0000000..19b981e
--- /dev/null
+++ b/LAM_gpro/external/landmark_detection/lib/dataset/decoder/decoder_default.py
@@ -0,0 +1,38 @@
+import torch
+
+
+class decoder_default:
+    def __init__(self, weight=1, use_weight_map=False):
+        self.weight = weight  # multiplier applied to heatmaps when use_weight_map is set
+        self.use_weight_map = use_weight_map
+
+    def _make_grid(self, h, w):
+        yy, xx = torch.meshgrid(
+            torch.arange(h).float() / (h - 1) * 2 - 1,
+            torch.arange(w).float() / (w - 1) * 2 - 1)
+        return yy, xx  # coordinate grids over [-1, 1] (align_corners=True convention)
+
+    def get_coords_from_heatmap(self, heatmap):
+        """
+        Soft-argmax decoding: expected (x, y) under the normalized heatmap mass.
+
+        inputs:  heatmap: batch x npoints x h x w
+
+        outputs: coords: batch x npoints x 2 (x, y), in [-1, +1]
+        (no 'radius_sq' is returned despite older descriptions)
+        """
+        batch, npoints, h, w = heatmap.shape
+        if self.use_weight_map:
+            heatmap = heatmap * self.weight
+
+        yy, xx = self._make_grid(h, w)
+        yy = yy.view(1, 1, h, w).to(heatmap)
+        xx = xx.view(1, 1, h, w).to(heatmap)
+
+        heatmap_sum = torch.clamp(heatmap.sum([2, 3]), min=1e-6)  # avoid division by zero for empty maps
+
+        yy_coord = (yy * heatmap).sum([2, 3]) / heatmap_sum  # batch x npoints
+        xx_coord = (xx * heatmap).sum([2, 3]) / heatmap_sum  # batch x npoints
+        coords = torch.stack([xx_coord, yy_coord], dim=-1)
+
+        return coords
diff --git a/LAM_gpro/external/landmark_detection/lib/dataset/encoder/__init__.py b/LAM_gpro/external/landmark_detection/lib/dataset/encoder/__init__.py
new file mode 100644
index 0000000..b80fe99
--- /dev/null
+++ b/LAM_gpro/external/landmark_detection/lib/dataset/encoder/__init__.py
@@ -0,0 +1,8 @@
+from .encoder_default import encoder_default
+
+def get_encoder(image_height, image_width, scale=0.25, sigma=1.5, encoder_type='default'):  # factory; only the Gaussian-heatmap 'default' encoder is implemented
+    if encoder_type == 'default':
+        encoder = encoder_default(image_height, image_width, scale, sigma)
+    else:
+        raise NotImplementedError
+    return encoder
diff --git a/LAM_gpro/external/landmark_detection/lib/dataset/encoder/encoder_default.py b/LAM_gpro/external/landmark_detection/lib/dataset/encoder/encoder_default.py
new file mode 100644
index 0000000..6662a94
--- /dev/null
+++ b/LAM_gpro/external/landmark_detection/lib/dataset/encoder/encoder_default.py
@@ -0,0 +1,63 @@
+import copy
+import numpy as np
+
+import torch
+import torch.nn.functional as F
+
+
+class encoder_default:
+    def __init__(self, image_height, image_width, scale=0.25, sigma=1.5):
+        self.image_height = image_height
+        self.image_width = image_width
+        self.scale = scale  # heatmap downsampling factor relative to the input image
+        self.sigma = sigma  # Gaussian radius (pixels) of each landmark blob
+
+    def generate_heatmap(self, points):
+        # points = (num_pts, 2); returns (num_pts, h*scale, w*scale) tensor of Gaussian blobs
+        h, w = self.image_height, self.image_width
+        pointmaps = []
+        for i in range(len(points)):
+            pointmap = np.zeros([h, w], dtype=np.float32)
+            # align_corners: False.
+            point = copy.deepcopy(points[i])
+            point[0] = max(0, min(w - 1, point[0]))
+            point[1] = max(0, min(h - 1, point[1]))
+            pointmap = self._circle(pointmap, point, sigma=self.sigma)
+
+            pointmaps.append(pointmap)
+        pointmaps = np.stack(pointmaps, axis=0) / 255.0
+        pointmaps = torch.from_numpy(pointmaps).float().unsqueeze(0)
+        pointmaps = F.interpolate(pointmaps, size=(int(h * self.scale), int(w * self.scale)), mode='bilinear',
+                                  align_corners=False).squeeze()  # bugfix: F.interpolate size is (out_h, out_w); was passed (w, h)
+        return pointmaps
+
+    def _circle(self, img, pt, sigma=1.0, label_type='Gaussian'):
+        # Check that any part of the gaussian is in-bounds
+        tmp_size = sigma * 3
+        ul = [int(pt[0] - tmp_size), int(pt[1] - tmp_size)]
+        br = [int(pt[0] + tmp_size + 1), int(pt[1] + tmp_size + 1)]
+        if (ul[0] > img.shape[1] - 1 or ul[1] > img.shape[0] - 1 or
+                br[0] - 1 < 0 or br[1] - 1 < 0):
+            # If not, just return the image as is
+            return img
+
+        # Generate gaussian
+        size = 2 * tmp_size + 1
+        x = np.arange(0, size, 1, np.float32)
+        y = x[:, np.newaxis]
+        x0 = y0 = size // 2
+        # The gaussian is not normalized, we want the center value to equal 1
+        if label_type == 'Gaussian':
+            g = np.exp(- ((x - x0) ** 2 + (y - y0) ** 2) / (2 * sigma ** 2))
+        else:
+            g = sigma / (((x - x0) ** 2 + (y - y0) ** 2 + sigma ** 2) ** 1.5)
+
+        # Usable gaussian range
+        g_x = max(0, -ul[0]), min(br[0], img.shape[1]) - ul[0]
+        g_y = max(0, -ul[1]), min(br[1], img.shape[0]) - ul[1]
+        # Image range
+        img_x = max(0, ul[0]), min(br[0], img.shape[1])
+        img_y = max(0, ul[1]), min(br[1], img.shape[0])
+
+        img[img_y[0]:img_y[1], img_x[0]:img_x[1]] = 255 * g[g_y[0]:g_y[1], g_x[0]:g_x[1]]
+        return img
diff --git a/LAM_gpro/external/landmark_detection/lib/loss/__init__.py b/LAM_gpro/external/landmark_detection/lib/loss/__init__.py
new file mode 100644
index 0000000..f71a33b
--- /dev/null
+++ b/LAM_gpro/external/landmark_detection/lib/loss/__init__.py
@@ -0,0 +1,14 @@
+from .awingLoss import AWingLoss
+from .smoothL1Loss import SmoothL1Loss
+from .wingLoss import WingLoss
+from .starLoss import STARLoss
+from .starLoss_v2 import STARLoss_v2
+
+__all__ = [
+ "AWingLoss",
+ "SmoothL1Loss",
+ "WingLoss",
+ "STARLoss",
+
+ "STARLoss_v2",
+]
diff --git a/LAM_gpro/external/landmark_detection/lib/loss/awingLoss.py b/LAM_gpro/external/landmark_detection/lib/loss/awingLoss.py
new file mode 100644
index 0000000..a5bfc57
--- /dev/null
+++ b/LAM_gpro/external/landmark_detection/lib/loss/awingLoss.py
@@ -0,0 +1,39 @@
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+
+class AWingLoss(nn.Module):  # Adaptive Wing loss for heatmap regression
+    def __init__(self, omega=14, theta=0.5, epsilon=1, alpha=2.1, use_weight_map=True):
+        super(AWingLoss, self).__init__()
+        self.omega = omega
+        self.theta = theta  # switch point between the log region and the linear region
+        self.epsilon = epsilon
+        self.alpha = alpha  # base exponent; adapted per pixel by the groundtruth value
+        self.use_weight_map = use_weight_map
+
+    def __repr__(self):
+        return "AWingLoss()"
+
+    def generate_weight_map(self, heatmap, k_size=3, w=10):
+        dilate = F.max_pool2d(heatmap, kernel_size=k_size, stride=1, padding=1)  # grow the foreground region
+        weight_map = torch.where(dilate < 0.2, torch.zeros_like(heatmap), torch.ones_like(heatmap))
+        return w * weight_map + 1  # foreground pixels weighted (w+1)x, background 1x
+
+    def forward(self, output, groundtruth):
+        """
+        input: b x n x h x w
+        output: b x n x h x w => 1 (scalar mean loss)
+        """
+        delta = (output - groundtruth).abs()
+        A = self.omega * (1 / (1 + torch.pow(self.theta / self.epsilon, self.alpha - groundtruth))) * (self.alpha - groundtruth) * \
+            (torch.pow(self.theta / self.epsilon, self.alpha - groundtruth - 1)) * (1 / self.epsilon)
+        C = self.theta * A - self.omega * \
+            torch.log(1 + torch.pow(self.theta / self.epsilon, self.alpha - groundtruth))
+        loss = torch.where(delta < self.theta,
+                           self.omega * torch.log(1 + torch.pow(delta / self.epsilon, self.alpha - groundtruth)),
+                           (A * delta - C))  # A and C keep the two pieces continuous at delta == theta
+        if self.use_weight_map:
+            weight = self.generate_weight_map(groundtruth)
+            loss = loss * weight
+        return loss.mean()
diff --git a/LAM_gpro/external/landmark_detection/lib/loss/smoothL1Loss.py b/LAM_gpro/external/landmark_detection/lib/loss/smoothL1Loss.py
new file mode 100644
index 0000000..e81104d
--- /dev/null
+++ b/LAM_gpro/external/landmark_detection/lib/loss/smoothL1Loss.py
@@ -0,0 +1,36 @@
+import torch
+import torch.nn as nn
+
+
+class SmoothL1Loss(nn.Module):  # Huber-style loss on point distances, switching regimes at 'scale'
+    def __init__(self, scale=0.01):
+        super(SmoothL1Loss, self).__init__()
+        self.scale = scale  # distance threshold between quadratic and linear regimes
+        self.EPSILON = 1e-10
+
+    def __repr__(self):
+        return "SmoothL1Loss()"
+
+    def forward(self, output: torch.Tensor, groundtruth: torch.Tensor, reduction='mean'):
+        """
+        input: b x n x 2
+        output: b x n x 1 => 1 (scalar for reduction='mean'/'sum'; per-point otherwise)
+        """
+        if output.dim() == 4:
+            shape = output.shape
+            groundtruth = groundtruth.reshape(shape[0], shape[1], 1, shape[3])
+
+        delta_2 = (output - groundtruth).pow(2).sum(dim=-1, keepdim=False)
+        delta = delta_2.clamp(min=1e-6).sqrt()  # clamp keeps the sqrt gradient finite at zero distance
+        # delta = torch.sqrt(delta_2 + self.EPSILON)
+        loss = torch.where( \
+            delta_2 < self.scale * self.scale, \
+            0.5 / self.scale * delta_2, \
+            delta - 0.5 * self.scale)
+
+        if reduction == 'mean':
+            loss = loss.mean()
+        elif reduction == 'sum':
+            loss = loss.sum()
+
+        return loss
diff --git a/LAM_gpro/external/landmark_detection/lib/loss/starLoss.py b/LAM_gpro/external/landmark_detection/lib/loss/starLoss.py
new file mode 100644
index 0000000..bfd4378
--- /dev/null
+++ b/LAM_gpro/external/landmark_detection/lib/loss/starLoss.py
@@ -0,0 +1,140 @@
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from torch.autograd import Variable
+
+from .smoothL1Loss import SmoothL1Loss
+from .wingLoss import WingLoss
+
+
+def get_channel_sum(input):
+ temp = torch.sum(input, dim=3)
+ output = torch.sum(temp, dim=2)
+ return output
+
+
+def expand_two_dimensions_at_end(input, dim1, dim2):
+ input = input.unsqueeze(-1).unsqueeze(-1)
+ input = input.expand(-1, -1, dim1, dim2)
+ return input
+
+
+class STARLoss(nn.Module):
+    def __init__(self, w=1, dist='smoothl1', num_dim_image=2, EPSILON=1e-5):
+        super(STARLoss, self).__init__()
+        self.w = w  # weight of the eigenvalue-restriction regularizer
+        self.num_dim_image = num_dim_image
+        self.EPSILON = EPSILON
+        self.dist = dist
+        if self.dist == 'smoothl1':
+            self.dist_func = SmoothL1Loss()
+        elif self.dist == 'l1':
+            self.dist_func = F.l1_loss
+        elif self.dist == 'l2':
+            self.dist_func = F.mse_loss
+        elif self.dist == 'wing':
+            self.dist_func = WingLoss()
+        else:
+            raise NotImplementedError
+
+    def __repr__(self):
+        return "STARLoss()"
+
+    def _make_grid(self, h, w):
+        yy, xx = torch.meshgrid(
+            torch.arange(h).float() / (h - 1) * 2 - 1,
+            torch.arange(w).float() / (w - 1) * 2 - 1)
+        return yy, xx
+
+    def weighted_mean(self, heatmap):
+        batch, npoints, h, w = heatmap.shape
+
+        yy, xx = self._make_grid(h, w)
+        yy = yy.view(1, 1, h, w).to(heatmap)
+        xx = xx.view(1, 1, h, w).to(heatmap)
+
+        yy_coord = (yy * heatmap).sum([2, 3])  # batch x npoints
+        xx_coord = (xx * heatmap).sum([2, 3])  # batch x npoints
+        coords = torch.stack([xx_coord, yy_coord], dim=-1)
+        return coords
+
+    def unbiased_weighted_covariance(self, htp, means, num_dim_image=2, EPSILON=1e-5):
+        batch_size, num_points, height, width = htp.shape
+
+        yv, xv = self._make_grid(height, width)
+        xv = Variable(xv)
+        yv = Variable(yv)
+
+        if htp.is_cuda:
+            xv = xv.cuda()
+            yv = yv.cuda()
+
+        xmean = means[:, :, 0]
+        xv_minus_mean = xv.expand(batch_size, num_points, -1, -1) - expand_two_dimensions_at_end(xmean, height,
+                                                                                                width)  # [batch_size, 68, 64, 64]
+        ymean = means[:, :, 1]
+        yv_minus_mean = yv.expand(batch_size, num_points, -1, -1) - expand_two_dimensions_at_end(ymean, height,
+                                                                                                width)  # [batch_size, 68, 64, 64]
+        wt_xv_minus_mean = xv_minus_mean
+        wt_yv_minus_mean = yv_minus_mean
+
+        wt_xv_minus_mean = wt_xv_minus_mean.view(batch_size * num_points, height * width)  # [batch_size*68, 4096]
+        wt_xv_minus_mean = wt_xv_minus_mean.view(batch_size * num_points, 1, height * width)  # [batch_size*68, 1, 4096]
+        wt_yv_minus_mean = wt_yv_minus_mean.view(batch_size * num_points, height * width)  # [batch_size*68, 4096]
+        wt_yv_minus_mean = wt_yv_minus_mean.view(batch_size * num_points, 1, height * width)  # [batch_size*68, 1, 4096]
+        vec_concat = torch.cat((wt_xv_minus_mean, wt_yv_minus_mean), 1)  # [batch_size*68, 2, 4096]
+
+        htp_vec = htp.view(batch_size * num_points, 1, height * width)
+        htp_vec = htp_vec.expand(-1, 2, -1)
+
+        covariance = torch.bmm(htp_vec * vec_concat, vec_concat.transpose(1, 2))  # [batch_size*68, 2, 2]
+        covariance = covariance.view(batch_size, num_points, num_dim_image, num_dim_image)  # [batch_size, 68, 2, 2]
+
+        V_1 = htp.sum([2, 3]) + EPSILON  # [batch_size, 68]
+        V_2 = torch.pow(htp, 2).sum([2, 3]) + EPSILON  # [batch_size, 68]
+
+        denominator = V_1 - (V_2 / V_1)  # unbiased weighted-covariance normalizer
+        covariance = covariance / expand_two_dimensions_at_end(denominator, num_dim_image, num_dim_image)
+
+        return covariance
+
+    def ambiguity_guided_decompose(self, pts, eigenvalues, eigenvectors):
+        batch_size, npoints = pts.shape[:2]
+        rotate = torch.matmul(pts.view(batch_size, npoints, 1, 2), eigenvectors.transpose(-1, -2))
+        scale = rotate.view(batch_size, npoints, 2) / torch.sqrt(eigenvalues + self.EPSILON)  # rotate error into eigenbasis, scale by sqrt(eigenvalue)
+        return scale
+
+    def eigenvalue_restriction(self, evalues, batch, npoints):
+        eigen_loss = torch.abs(evalues.view(batch * npoints, 2)).sum(-1)  # penalize heatmap spread
+        return eigen_loss.mean()
+
+    def forward(self, heatmap, groundtruth):
+        """
+        heatmap: b x n x 64 x 64
+        groundtruth: b x n x 2
+        output: b x n x 1 => 1
+        """
+        # normalize
+        bs, npoints, h, w = heatmap.shape
+        heatmap_sum = torch.clamp(heatmap.sum([2, 3]), min=1e-6)
+        heatmap = heatmap / heatmap_sum.view(bs, npoints, 1, 1)
+
+        means = self.weighted_mean(heatmap)  # [bs, 68, 2]
+        covars = self.unbiased_weighted_covariance(heatmap, means)  # covars [bs, 68, 2, 2]
+
+        # eigen-decomposition kept on CPU for the many tiny 2x2 problems
+        # https://github.com/pytorch/pytorch/issues/60537
+        _covars = covars.view(bs * npoints, 2, 2).cpu()
+        evalues, evectors = torch.linalg.eigh(_covars)  # bugfix: Tensor.symeig was removed in modern torch; eigh also returns ascending evalues
+        evalues = evalues.view(bs, npoints, 2).to(heatmap)
+        evectors = evectors.view(bs, npoints, 2, 2).to(heatmap)
+
+        # STAR Loss
+        # Ambiguity-guided Decomposition
+        error = self.ambiguity_guided_decompose(groundtruth - means, evalues, evectors)
+        loss_trans = self.dist_func(torch.zeros_like(error).to(error), error)
+        # Eigenvalue Restriction
+        loss_eigen = self.eigenvalue_restriction(evalues, bs, npoints)
+        star_loss = loss_trans + self.w * loss_eigen
+
+        return star_loss
diff --git a/LAM_gpro/external/landmark_detection/lib/loss/starLoss_v2.py b/LAM_gpro/external/landmark_detection/lib/loss/starLoss_v2.py
new file mode 100644
index 0000000..c182ff8
--- /dev/null
+++ b/LAM_gpro/external/landmark_detection/lib/loss/starLoss_v2.py
@@ -0,0 +1,150 @@
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from torch.autograd import Variable
+
+from .smoothL1Loss import SmoothL1Loss
+from .wingLoss import WingLoss
+
+
+def get_channel_sum(input):
+ temp = torch.sum(input, dim=3)
+ output = torch.sum(temp, dim=2)
+ return output
+
+
+def expand_two_dimensions_at_end(input, dim1, dim2):
+ input = input.unsqueeze(-1).unsqueeze(-1)
+ input = input.expand(-1, -1, dim1, dim2)
+ return input
+
+
+class STARLoss_v2(nn.Module):
+    def __init__(self, w=1, dist='smoothl1', num_dim_image=2, EPSILON=1e-5):
+        super(STARLoss_v2, self).__init__()
+        self.w = w  # weight of the eigenvalue-restriction regularizer
+        self.num_dim_image = num_dim_image
+        self.EPSILON = EPSILON
+        self.dist = dist
+        if self.dist == 'smoothl1':
+            self.dist_func = SmoothL1Loss()
+        elif self.dist == 'l1':
+            self.dist_func = F.l1_loss
+        elif self.dist == 'l2':
+            self.dist_func = F.mse_loss
+        elif self.dist == 'wing':
+            self.dist_func = WingLoss()
+        else:
+            raise NotImplementedError
+
+    def __repr__(self):
+        return "STARLoss()"
+
+    def _make_grid(self, h, w):
+        yy, xx = torch.meshgrid(
+            torch.arange(h).float() / (h - 1) * 2 - 1,
+            torch.arange(w).float() / (w - 1) * 2 - 1)
+        return yy, xx
+
+    def weighted_mean(self, heatmap):
+        batch, npoints, h, w = heatmap.shape
+
+        yy, xx = self._make_grid(h, w)
+        yy = yy.view(1, 1, h, w).to(heatmap)
+        xx = xx.view(1, 1, h, w).to(heatmap)
+
+        yy_coord = (yy * heatmap).sum([2, 3])  # batch x npoints
+        xx_coord = (xx * heatmap).sum([2, 3])  # batch x npoints
+        coords = torch.stack([xx_coord, yy_coord], dim=-1)
+        return coords
+
+    def unbiased_weighted_covariance(self, htp, means, num_dim_image=2, EPSILON=1e-5):
+        batch_size, num_points, height, width = htp.shape
+
+        yv, xv = self._make_grid(height, width)
+        xv = Variable(xv)
+        yv = Variable(yv)
+
+        if htp.is_cuda:
+            xv = xv.cuda()
+            yv = yv.cuda()
+
+        xmean = means[:, :, 0]
+        xv_minus_mean = xv.expand(batch_size, num_points, -1, -1) - expand_two_dimensions_at_end(xmean, height,
+                                                                                                width)  # [batch_size, 68, 64, 64]
+        ymean = means[:, :, 1]
+        yv_minus_mean = yv.expand(batch_size, num_points, -1, -1) - expand_two_dimensions_at_end(ymean, height,
+                                                                                                width)  # [batch_size, 68, 64, 64]
+        wt_xv_minus_mean = xv_minus_mean
+        wt_yv_minus_mean = yv_minus_mean
+
+        wt_xv_minus_mean = wt_xv_minus_mean.view(batch_size * num_points, height * width)  # [batch_size*68, 4096]
+        wt_xv_minus_mean = wt_xv_minus_mean.view(batch_size * num_points, 1, height * width)  # [batch_size*68, 1, 4096]
+        wt_yv_minus_mean = wt_yv_minus_mean.view(batch_size * num_points, height * width)  # [batch_size*68, 4096]
+        wt_yv_minus_mean = wt_yv_minus_mean.view(batch_size * num_points, 1, height * width)  # [batch_size*68, 1, 4096]
+        vec_concat = torch.cat((wt_xv_minus_mean, wt_yv_minus_mean), 1)  # [batch_size*68, 2, 4096]
+
+        htp_vec = htp.view(batch_size * num_points, 1, height * width)
+        htp_vec = htp_vec.expand(-1, 2, -1)
+
+        covariance = torch.bmm(htp_vec * vec_concat, vec_concat.transpose(1, 2))  # [batch_size*68, 2, 2]
+        covariance = covariance.view(batch_size, num_points, num_dim_image, num_dim_image)  # [batch_size, 68, 2, 2]
+
+        V_1 = htp.sum([2, 3]) + EPSILON  # [batch_size, 68]
+        V_2 = torch.pow(htp, 2).sum([2, 3]) + EPSILON  # [batch_size, 68]
+
+        denominator = V_1 - (V_2 / V_1)  # unbiased weighted-covariance normalizer
+        covariance = covariance / expand_two_dimensions_at_end(denominator, num_dim_image, num_dim_image)
+
+        return covariance
+
+    def ambiguity_guided_decompose(self, error, evalues, evectors):
+        bs, npoints = error.shape[:2]
+        normal_vector = evectors[:, :, 0]  # NOTE(review): takes rows of the eigenvector matrix; eigh stores eigenvectors as columns -- confirm orientation
+        tangent_vector = evectors[:, :, 1]
+        normal_error = torch.matmul(normal_vector.unsqueeze(-2), error.unsqueeze(-1))
+        tangent_error = torch.matmul(tangent_vector.unsqueeze(-2), error.unsqueeze(-1))
+        normal_error = normal_error.squeeze(dim=-1)
+        tangent_error = tangent_error.squeeze(dim=-1)
+        normal_dist = self.dist_func(normal_error, torch.zeros_like(normal_error).to(normal_error), reduction='none')
+        tangent_dist = self.dist_func(tangent_error, torch.zeros_like(tangent_error).to(tangent_error), reduction='none')
+        normal_dist = normal_dist.reshape(bs, npoints, 1)
+        tangent_dist = tangent_dist.reshape(bs, npoints, 1)
+        dist = torch.cat((normal_dist, tangent_dist), dim=-1)
+        scale_dist = dist / torch.sqrt(evalues + self.EPSILON)  # ambiguity-aware scaling by sqrt(eigenvalue)
+        scale_dist = scale_dist.sum(-1)
+        return scale_dist
+
+    def eigenvalue_restriction(self, evalues, batch, npoints):
+        eigen_loss = torch.abs(evalues.view(batch, npoints, 2)).sum(-1)  # penalize heatmap spread, per point
+        return eigen_loss
+
+    def forward(self, heatmap, groundtruth):
+        """
+        heatmap: b x n x 64 x 64
+        groundtruth: b x n x 2
+        output: b x n x 1 => 1
+        """
+        # normalize
+        bs, npoints, h, w = heatmap.shape
+        heatmap_sum = torch.clamp(heatmap.sum([2, 3]), min=1e-6)
+        heatmap = heatmap / heatmap_sum.view(bs, npoints, 1, 1)
+
+        means = self.weighted_mean(heatmap)  # [bs, 68, 2]
+        covars = self.unbiased_weighted_covariance(heatmap, means)  # covars [bs, 68, 2, 2]
+
+        # eigen-decomposition kept on CPU for the many tiny 2x2 problems
+        # https://github.com/pytorch/pytorch/issues/60537
+        _covars = covars.view(bs * npoints, 2, 2).cpu()
+        evalues, evectors = torch.linalg.eigh(_covars)  # bugfix: Tensor.symeig was removed in modern torch; eigh also returns ascending evalues
+        evalues = evalues.view(bs, npoints, 2).to(heatmap)
+        evectors = evectors.view(bs, npoints, 2, 2).to(heatmap)
+
+        # STAR Loss
+        # Ambiguity-guided Decomposition
+        loss_trans = self.ambiguity_guided_decompose(groundtruth - means, evalues, evectors)
+        # Eigenvalue Restriction
+        loss_eigen = self.eigenvalue_restriction(evalues, bs, npoints)
+        star_loss = loss_trans + self.w * loss_eigen
+
+        return star_loss.mean()
diff --git a/LAM_gpro/external/landmark_detection/lib/loss/wingLoss.py b/LAM_gpro/external/landmark_detection/lib/loss/wingLoss.py
new file mode 100644
index 0000000..578f71c
--- /dev/null
+++ b/LAM_gpro/external/landmark_detection/lib/loss/wingLoss.py
@@ -0,0 +1,27 @@
+# -*- coding: utf-8 -*-
+
+import math
+import torch
+from torch import nn
+
+
+# torch.log and math.log is e based
+class WingLoss(nn.Module):
+    def __init__(self, omega=0.01, epsilon=2):
+        super(WingLoss, self).__init__()
+        self.omega = omega  # threshold where the loss switches from log to linear
+        self.epsilon = epsilon  # curvature of the log region
+
+    def forward(self, pred, target):
+        y = target
+        y_hat = pred
+        delta_2 = (y - y_hat).pow(2).sum(dim=-1, keepdim=False)
+        # delta = delta_2.sqrt()
+        delta = delta_2.clamp(min=1e-6).sqrt()  # clamp keeps the sqrt gradient finite at zero distance
+        C = self.omega - self.omega * math.log(1 + self.omega / self.epsilon)  # keeps both pieces continuous at delta == omega
+        loss = torch.where(
+            delta < self.omega,
+            self.omega * torch.log(1 + delta / self.epsilon),
+            delta - C
+        )
+        return loss.mean()
diff --git a/LAM_gpro/external/landmark_detection/lib/metric/__init__.py b/LAM_gpro/external/landmark_detection/lib/metric/__init__.py
new file mode 100644
index 0000000..e843d42
--- /dev/null
+++ b/LAM_gpro/external/landmark_detection/lib/metric/__init__.py
@@ -0,0 +1,11 @@
+from .nme import NME
+from .accuracy import Accuracy
+from .fr_and_auc import FR_AUC
+from .params import count_parameters_in_MB
+
+__all__ = [
+ "NME",
+ "Accuracy",
+ "FR_AUC",
+ 'count_parameters_in_MB',
+]
diff --git a/LAM_gpro/external/landmark_detection/lib/metric/accuracy.py b/LAM_gpro/external/landmark_detection/lib/metric/accuracy.py
new file mode 100644
index 0000000..d007da2
--- /dev/null
+++ b/LAM_gpro/external/landmark_detection/lib/metric/accuracy.py
@@ -0,0 +1,21 @@
+import torch
+import torch.nn.functional as F
+
class Accuracy:
    """Top-1 classification accuracy counts."""

    def __init__(self):
        pass

    def __repr__(self):
        return "Accuracy()"

    def test(self, label_pd, label_gt, ignore_label=-1):
        """Return (num_correct, num_counted) for a batch of logits.

        label_pd: logits of shape (N, C); label_gt: integer labels (N).
        Entries equal to ignore_label are removed from the denominator only.
        """
        with torch.no_grad():
            probs = F.softmax(label_pd, dim=1)  # softmax preserves the argmax
            preds = probs.argmax(dim=1)
            gts = label_gt.long()
            matches = preds.eq(gts)
            num_correct = int(matches.sum().item())
            num_total = int(matches.size(0) - (gts == ignore_label).sum().item())
        return num_correct, num_total
diff --git a/LAM_gpro/external/landmark_detection/lib/metric/fr_and_auc.py b/LAM_gpro/external/landmark_detection/lib/metric/fr_and_auc.py
new file mode 100644
index 0000000..b4ceec4
--- /dev/null
+++ b/LAM_gpro/external/landmark_detection/lib/metric/fr_and_auc.py
@@ -0,0 +1,25 @@
+import numpy as np
+from scipy.integrate import simps
+
+
class FR_AUC:
    """Failure Rate (FR) and Area Under the CED Curve (AUC) from NME values."""

    def __init__(self, data_definition):
        self.data_definition = data_definition
        # 300W conventionally uses a 0.05 cutoff; other benchmarks use 0.10.
        if data_definition == '300W':
            self.thresh = 0.05
        else:
            self.thresh = 0.1

    def __repr__(self):
        return "FR_AUC()"

    def test(self, nmes, thres=None, step=0.0001):
        # nmes: per-sample NME values (array-like).  Returns [FR, AUC].
        if thres is None:
            thres = self.thresh

        num_data = len(nmes)
        # ys[i] = fraction of samples with NME <= xs[i] (the CED curve).
        xs = np.arange(0, thres + step, step)
        ys = np.array([np.count_nonzero(nmes <= x) for x in xs]) / float(num_data)
        # Failure rate: fraction of samples above the cutoff.
        fr = 1.0 - ys[-1]
        # Simpson integration of the CED, normalized by the threshold.
        # NOTE(review): scipy.integrate.simps was renamed to simpson and the
        # alias removed in SciPy 1.14 -- fine for the pinned scipy, verify
        # on upgrade.
        auc = simps(ys, x=xs) / thres
        return [round(fr, 4), round(auc, 6)]
diff --git a/LAM_gpro/external/landmark_detection/lib/metric/nme.py b/LAM_gpro/external/landmark_detection/lib/metric/nme.py
new file mode 100644
index 0000000..2da6b07
--- /dev/null
+++ b/LAM_gpro/external/landmark_detection/lib/metric/nme.py
@@ -0,0 +1,39 @@
+import torch
+import numpy as np
+
class NME:
    """Normalized Mean Error metric.

    The normalization distance is either the distance between two reference
    landmarks (when the indices are ints) or between the mean positions of
    two landmark groups, e.g. pupil centers (when the indices are lists).
    """

    def __init__(self, nme_left_index, nme_right_index):
        self.nme_left_index = nme_left_index
        self.nme_right_index = nme_right_index

    def __repr__(self):
        return "NME()"

    def get_norm_distance(self, landmarks):
        """Distance between the centroids of the two landmark groups."""
        assert isinstance(self.nme_right_index, list), 'the nme_right_index is not list.'
        assert isinstance(self.nme_left_index, list), 'the nme_left, index is not list.'
        right_pupil = landmarks[self.nme_right_index, :].mean(0)
        left_pupil = landmarks[self.nme_left_index, :].mean(0)
        return np.linalg.norm(right_pupil - left_pupil)

    def test(self, label_pd, label_gt):
        """Per-sample NME for a batch of predicted/ground-truth landmarks."""
        preds = label_pd.data.cpu().numpy()
        gts = label_gt.data.cpu().numpy()

        nme_list = []
        for pred, gt in zip(preds, gts):
            if isinstance(self.nme_right_index, list):
                norm_distance = self.get_norm_distance(gt)
            elif isinstance(self.nme_right_index, int):
                norm_distance = np.linalg.norm(gt[self.nme_left_index] - gt[self.nme_right_index])
            else:
                raise NotImplementedError
            # Mean over points of (per-point L2 error / normalization).
            nme_list.append((np.linalg.norm(pred - gt, axis=1) / norm_distance).mean())
        return nme_list
diff --git a/LAM_gpro/external/landmark_detection/lib/metric/params.py b/LAM_gpro/external/landmark_detection/lib/metric/params.py
new file mode 100644
index 0000000..7b55520
--- /dev/null
+++ b/LAM_gpro/external/landmark_detection/lib/metric/params.py
@@ -0,0 +1,7 @@
+import torch.nn as nn
+
def count_parameters_in_MB(model):
    """Parameter count in millions ("MB" follows the original naming).

    Accepts either an nn.Module or an iterable of tensors.
    """
    params = model.parameters() if isinstance(model, nn.Module) else model
    return sum(p.numel() for p in params) / 1e6
\ No newline at end of file
diff --git a/LAM_gpro/external/landmark_detection/lib/utility.py b/LAM_gpro/external/landmark_detection/lib/utility.py
new file mode 100644
index 0000000..28f5ae7
--- /dev/null
+++ b/LAM_gpro/external/landmark_detection/lib/utility.py
@@ -0,0 +1,362 @@
+import json
+import os.path as osp
+import time
+import torch
+import numpy as np
+from tqdm import tqdm
+
+import torchvision.transforms as transforms
+from torch.utils.data import DataLoader, DistributedSampler
+import torch.optim as optim
+import torch.optim.lr_scheduler as lr_scheduler
+import torch.nn.functional as F
+
+# private package
+from external.landmark_detection.conf import *
+from external.landmark_detection.lib.dataset import AlignmentDataset
+from external.landmark_detection.lib.backbone import StackedHGNetV1
+from external.landmark_detection.lib.loss import *
+from external.landmark_detection.lib.metric import NME, FR_AUC
+from external.landmark_detection.lib.utils import convert_secs2time
+from external.landmark_detection.lib.utils import AverageMeter
+
+
def get_config(args):
    """Build the experiment configuration object from CLI args.

    Always constructs an Alignment config; args.config_name is read but
    currently unused (kept for interface compatibility).
    """
    config = None
    config_name = args.config_name
    config = Alignment(args)


    return config
+
+
def get_dataset(config, tsv_file, image_dir, loader_type, is_train):
    """Build the dataset for one split.

    Only the "alignment" loader_type is supported; anything else asserts.
    Geometry/augmentation parameters all come from the config object.
    """
    dataset = None
    if loader_type == "alignment":
        dataset = AlignmentDataset(
            tsv_file,
            image_dir,
            transforms.Compose([transforms.ToTensor()]),
            config.width,
            config.height,
            config.channels,
            config.means,
            config.scale,
            config.classes_num,
            config.crop_op,
            config.aug_prob,
            config.edge_info,
            config.flip_mapping,
            is_train,
            encoder_type=config.encoder_type
        )
    else:
        assert False
    return dataset
+
+
def get_dataloader(config, data_type, world_rank=0, world_size=1):
    """Create the DataLoader for data_type in {"train", "val", "test"}.

    For training with world_size > 1, a DistributedSampler is used and the
    global batch size is split evenly across ranks.
    """
    loader = None
    if data_type == "train":
        dataset = get_dataset(
            config,
            config.train_tsv_file,
            config.train_pic_dir,
            config.loader_type,
            is_train=True)
        if world_size > 1:
            # Per-rank batch: config.batch_size is the global batch size.
            sampler = DistributedSampler(dataset, rank=world_rank, num_replicas=world_size, shuffle=True)
            loader = DataLoader(dataset, sampler=sampler, batch_size=config.batch_size // world_size,
                                num_workers=config.train_num_workers, pin_memory=True, drop_last=True)
        else:
            loader = DataLoader(dataset, batch_size=config.batch_size, shuffle=True,
                                num_workers=config.train_num_workers)
    elif data_type == "val":
        dataset = get_dataset(
            config,
            config.val_tsv_file,
            config.val_pic_dir,
            config.loader_type,
            is_train=False)
        loader = DataLoader(dataset, shuffle=False, batch_size=config.val_batch_size,
                            num_workers=config.val_num_workers)
    elif data_type == "test":
        dataset = get_dataset(
            config,
            config.test_tsv_file,
            config.test_pic_dir,
            config.loader_type,
            is_train=False)
        loader = DataLoader(dataset, shuffle=False, batch_size=config.test_batch_size,
                            num_workers=config.test_num_workers)
    else:
        assert False
    return loader
+
+
def get_optimizer(config, net):
    """Build the optimizer ("sgd" | "adam" | "rmsprop") over net's parameters."""
    params = net.parameters()

    optimizer = None
    if config.optimizer == "sgd":
        optimizer = optim.SGD(
            params,
            lr=config.learn_rate,
            momentum=config.momentum,
            weight_decay=config.weight_decay,
            nesterov=config.nesterov)
    elif config.optimizer == "adam":
        # Adam uses its library defaults for betas/eps/weight_decay.
        optimizer = optim.Adam(
            params,
            lr=config.learn_rate)
    elif config.optimizer == "rmsprop":
        optimizer = optim.RMSprop(
            params,
            lr=config.learn_rate,
            momentum=config.momentum,
            alpha=config.alpha,
            eps=config.epsilon,
            weight_decay=config.weight_decay
        )
    else:
        assert False
    return optimizer
+
+
def get_scheduler(config, optimizer):
    """Build the LR scheduler; only MultiStepLR is supported."""
    if config.scheduler == "MultiStepLR":
        scheduler = lr_scheduler.MultiStepLR(optimizer, milestones=config.milestones, gamma=config.gamma)
    else:
        assert False
    return scheduler
+
+
def get_net(config):
    """Build the network; only the stacked-hourglass v1 backbone is supported."""
    net = None
    if config.net == "stackedHGnet_v1":
        net = StackedHGNetV1(config=config,
                             classes_num=config.classes_num,
                             edge_info=config.edge_info,
                             nstack=config.nstack,
                             add_coord=config.add_coord,
                             decoder_type=config.decoder_type)
    else:
        assert False
    return net
+
+
def get_criterions(config):
    """Build one loss callable per output label, per config.criterions[k]."""
    criterions = list()
    for k in range(config.label_num):
        if config.criterions[k] == "AWingLoss":
            criterion = AWingLoss()
        elif config.criterions[k] == "smoothl1":
            criterion = SmoothL1Loss()
        elif config.criterions[k] == "l1":
            # Functional losses are used directly as callables.
            criterion = F.l1_loss
        elif config.criterions[k] == 'l2':
            criterion = F.mse_loss
        elif config.criterions[k] == "STARLoss":
            criterion = STARLoss(dist=config.star_dist, w=config.star_w)
        elif config.criterions[k] == "STARLoss_v2":
            criterion = STARLoss_v2(dist=config.star_dist, w=config.star_w)
        else:
            assert False
        criterions.append(criterion)
    return criterions
+
+
def set_environment(config):
    """Select the compute device and set global torch flags.

    Side effects: sets config.device and config.use_gpu; enables cudnn
    benchmark mode and autograd anomaly detection process-wide.
    """
    if config.device_id >= 0:
        assert torch.cuda.is_available() and torch.cuda.device_count() > config.device_id
        torch.cuda.empty_cache()
        config.device = torch.device("cuda", config.device_id)
        config.use_gpu = True
    else:
        config.device = torch.device("cpu")
        config.use_gpu = False

    torch.set_default_dtype(torch.float32)
    torch.set_default_tensor_type(torch.FloatTensor)
    torch.set_flush_denormal(True)  # ignore extremely small value
    torch.backends.cudnn.benchmark = True  # This flag allows you to enable the inbuilt cudnn auto-tuner to find the best algorithm to use for your hardware.
    # NOTE(review): anomaly detection slows every backward pass; it is
    # normally a debug-only switch -- confirm it is intended for production.
    torch.autograd.set_detect_anomaly(True)
+
+
def forward(config, test_loader, net):
    """Run inference over test_loader and compute NME/FR/AUC metrics.

    Returns (output_pd, metrics); output_pd is currently always None (it is
    initialized and never reassigned).  metrics[k] = [mean NME, FR, AUC]
    for output label k.
    """
    # ave_metrics = [[0, 0] for i in range(config.label_num)]
    list_nmes = [[] for i in range(config.label_num)]
    metric_nme = NME(nme_left_index=config.nme_left_index, nme_right_index=config.nme_right_index)
    metric_fr_auc = FR_AUC(data_definition=config.data_definition)

    output_pd = None

    net = net.float().to(config.device)
    net.eval()
    dataset_size = len(test_loader.dataset)
    batch_size = test_loader.batch_size
    if config.logger is not None:
        config.logger.info("Forward process, Dataset size: %d, Batch size: %d" % (dataset_size, batch_size))
    for i, sample in enumerate(tqdm(test_loader)):
        input = sample["data"].float().to(config.device, non_blocking=True)
        # Collect per-task targets; a single stacked label tensor is split
        # along dim 1 into one target per task.
        labels = list()
        if isinstance(sample["label"], list):
            for label in sample["label"]:
                label = label.float().to(config.device, non_blocking=True)
                labels.append(label)
        else:
            label = sample["label"].float().to(config.device, non_blocking=True)
            for k in range(label.shape[1]):
                labels.append(label[:, k])
        # Replicate the targets once per hourglass stack.
        labels = config.nstack * labels

        with torch.no_grad():
            output, heatmap, landmarks = net(input)

        # metrics
        for k in range(config.label_num):
            if config.metrics[k] is not None:
                list_nmes[k] += metric_nme.test(output[k], labels[k])

    metrics = [[np.mean(nmes), ] + metric_fr_auc.test(nmes) for nmes in list_nmes]

    return output_pd, metrics
+
+
def compute_loss(config, criterions, output, labels, heatmap=None, landmarks=None):
    """Compute per-task losses and their weighted sum.

    Args:
        config: experiment config (reads criterions, loss_weights,
            label_num, use_AAM).
        criterions: list of loss callables, one per label.
        output: list of per-task network outputs.
        labels: list of per-task targets.
        heatmap: per-stack heatmaps; required by the STAR losses.
        landmarks: unused here; kept for signature compatibility.

    Returns:
        (losses, sum_loss): per-task losses as Python floats, and the
        weighted total as a tensor suitable for backward().
    """
    batch_weight = 1.0
    sum_loss = 0
    losses = list()
    for k in range(config.label_num):
        if config.criterions[k] in ['smoothl1', 'l1', 'l2', 'WingLoss', 'AWingLoss']:
            loss = criterions[k](output[k], labels[k])
        elif config.criterions[k] in ["STARLoss", "STARLoss_v2"]:
            # With AAM enabled, every 3 outputs share one heatmap stack.
            _k = int(k / 3) if config.use_AAM else k
            loss = criterions[k](heatmap[_k], labels[k])
        else:
            # BUG FIX: was `assert NotImplementedError`, which asserts on a
            # truthy class object and never fires, then crashed below with
            # a NameError on the undefined `loss`.  Raise explicitly.
            raise NotImplementedError(
                "unsupported criterion: %s" % config.criterions[k])
        loss = batch_weight * loss
        sum_loss += config.loss_weights[k] * loss
        loss = float(loss.data.cpu().item())
        losses.append(loss)
    return losses, sum_loss
+
+
def forward_backward(config, train_loader, net_module, net, net_ema, criterions, optimizer, epoch):
    """Run one training epoch: forward, loss, backward, optional EMA update.

    net_module is the (possibly DDP-wrapped) module used for forward/backward;
    net is the underlying module used as the EMA source.  Logs timing and
    per-task losses through config.logger.
    """
    train_model_time = AverageMeter()
    ave_losses = [0] * config.label_num

    net_module = net_module.float().to(config.device)
    net_module.train(True)
    dataset_size = len(train_loader.dataset)
    batch_size = config.batch_size  # train_loader.batch_size
    batch_num = max(dataset_size / max(batch_size, 1), 1)
    if config.logger is not None:
        config.logger.info(config.note)
        config.logger.info("Forward Backward process, Dataset size: %d, Batch size: %d" % (dataset_size, batch_size))

    iter_num = len(train_loader)
    epoch_start_time = time.time()
    # net_module != net implies a distributed wrapper; reseed its sampler
    # so each epoch sees a different shard ordering.
    if net_module != net:
        train_loader.sampler.set_epoch(epoch)
    for iter, sample in enumerate(train_loader):
        iter_start_time = time.time()
        # input
        input = sample["data"].float().to(config.device, non_blocking=True)
        # labels
        labels = list()
        if isinstance(sample["label"], list):
            for label in sample["label"]:
                label = label.float().to(config.device, non_blocking=True)
                labels.append(label)
        else:
            label = sample["label"].float().to(config.device, non_blocking=True)
            for k in range(label.shape[1]):
                labels.append(label[:, k])
        # Replicate targets once per hourglass stack.
        labels = config.nstack * labels
        # forward
        output, heatmaps, landmarks = net_module(input)

        # loss
        losses, sum_loss = compute_loss(config, criterions, output, labels, heatmaps, landmarks)
        ave_losses = list(map(sum, zip(ave_losses, losses)))

        # backward
        optimizer.zero_grad()
        # NOTE(review): anomaly detection on every backward is costly;
        # normally debug-only.
        with torch.autograd.detect_anomaly():
            sum_loss.backward()
        # torch.nn.utils.clip_grad_norm_(net_module.parameters(), 128.0)
        optimizer.step()

        if net_ema is not None:
            # EMA decay scales with batch size so the averaging horizon is
            # roughly constant in samples seen.
            accumulate_net(net_ema, net, 0.5 ** (config.batch_size / 10000.0))
            # accumulate_net(net_ema, net, 0.5 ** (8 / 10000.0))

        # output
        train_model_time.update(time.time() - iter_start_time)
        last_time = convert_secs2time(train_model_time.avg * (iter_num - iter - 1), True)
        if iter % config.display_iteration == 0 or iter + 1 == len(train_loader):
            if config.logger is not None:
                losses_str = ' Average Loss: {:.6f}'.format(sum(losses) / len(losses))
                for k, loss in enumerate(losses):
                    losses_str += ', L{}: {:.3f}'.format(k, loss)
                config.logger.info(
                    ' -->>[{:03d}/{:03d}][{:03d}/{:03d}]'.format(epoch, config.max_epoch, iter, iter_num) \
                    + last_time + losses_str)

    epoch_end_time = time.time()
    epoch_total_time = epoch_end_time - epoch_start_time
    epoch_load_data_time = epoch_total_time - train_model_time.sum
    if config.logger is not None:
        config.logger.info("Train/Epoch: %d/%d, Average total time cost per iteration in this epoch: %.6f" % (
            epoch, config.max_epoch, epoch_total_time / iter_num))
        config.logger.info("Train/Epoch: %d/%d, Average loading data time cost per iteration in this epoch: %.6f" % (
            epoch, config.max_epoch, epoch_load_data_time / iter_num))
        config.logger.info("Train/Epoch: %d/%d, Average training model time cost per iteration in this epoch: %.6f" % (
            epoch, config.max_epoch, train_model_time.avg))

    ave_losses = [loss / iter_num for loss in ave_losses]
    if config.logger is not None:
        config.logger.info("Train/Epoch: %d/%d, Average Loss in this epoch: %.6f" % (
            epoch, config.max_epoch, sum(ave_losses) / len(ave_losses)))
    for k, ave_loss in enumerate(ave_losses):
        if config.logger is not None:
            config.logger.info("Train/Loss%03d in this epoch: %.6f" % (k, ave_loss))
+
+
def accumulate_net(model1, model2, decay):
    """
    operation: model1 = model1 * decay + model2 * (1 - decay)

    Exponential-moving-average update of model1's parameters and buffers
    from model2.  Non-floating-point buffers (e.g. integer counters) are
    copied outright instead of averaged.
    """
    par1 = dict(model1.named_parameters())
    par2 = dict(model2.named_parameters())
    for k in par1.keys():
        par1[k].data.mul_(decay).add_(
            other=par2[k].data.to(par1[k].data.device),
            alpha=1 - decay)

    # Buffers (e.g. BatchNorm running stats) are averaged the same way.
    par1 = dict(model1.named_buffers())
    par2 = dict(model2.named_buffers())
    for k in par1.keys():
        if par1[k].data.is_floating_point():
            par1[k].data.mul_(decay).add_(
                other=par2[k].data.to(par1[k].data.device),
                alpha=1 - decay)
        else:
            par1[k].data = par2[k].data.to(par1[k].data.device)
+
+
def save_model(config, epoch, net, net_ema, optimizer, scheduler, pytorch_model_path):
    """Checkpoint net/optimizer/scheduler (and EMA weights when enabled)."""
    # save pytorch model
    state = {
        "net": net.state_dict(),
        "optimizer": optimizer.state_dict(),
        "scheduler": scheduler.state_dict(),
        "epoch": epoch
    }
    if config.ema:
        state["net_ema"] = net_ema.state_dict()

    torch.save(state, pytorch_model_path)
    if config.logger is not None:
        config.logger.info("Epoch: %d/%d, model saved in this epoch" % (epoch, config.max_epoch))
diff --git a/LAM_gpro/external/landmark_detection/lib/utils/__init__.py b/LAM_gpro/external/landmark_detection/lib/utils/__init__.py
new file mode 100644
index 0000000..8cf0cbd
--- /dev/null
+++ b/LAM_gpro/external/landmark_detection/lib/utils/__init__.py
@@ -0,0 +1,16 @@
+from .meter import AverageMeter
+from .time_utils import time_print, time_string, time_string_short, time_for_file
+from .time_utils import convert_secs2time, convert_size2str
+from .vis_utils import plot_points
+
+__all__ = [
+ "AverageMeter",
+ "time_print",
+ "time_string",
+ "time_string_short",
+ "time_for_file",
+ "convert_size2str",
+ "convert_secs2time",
+
+ "plot_points",
+]
diff --git a/LAM_gpro/external/landmark_detection/lib/utils/dist_utils.py b/LAM_gpro/external/landmark_detection/lib/utils/dist_utils.py
new file mode 100644
index 0000000..ed54cab
--- /dev/null
+++ b/LAM_gpro/external/landmark_detection/lib/utils/dist_utils.py
@@ -0,0 +1,183 @@
+import torch
+from torch.autograd import Variable
+import matplotlib.pyplot as plt
+import seaborn as sns
+
+
def get_channel_sum(input):
    """
    Sum each spatial map over its last two dims.
    input  = batch_size x 68 x 64 x 64
    output = batch_size x 68
    """
    return input.sum(dim=(2, 3))
+
+
def expand_two_dimensions_at_end(input, dim1, dim2):
    """
    Broadcast (B, N) to (B, N, dim1, dim2) without copying data.
    input  = batch_size x 68
    output = batch_size x 68 x dim1 x dim2
    """
    return input[..., None, None].expand(-1, -1, dim1, dim2)
+
+
class Distribution(object):
    """Fit a 2D Gaussian to each landmark heatmap.

    Normalizes each heatmap to sum to 1, then computes its spatial mean,
    2x2 covariance, and the covariance's eigendecomposition.  Grid
    coordinates are in [-1, 1] when is_normalize is True, pixel units
    otherwise.
    """

    def __init__(self, heatmaps, num_dim_dist=2, EPSILON=1e-5, is_normalize=True):
        self.heatmaps = heatmaps
        self.num_dim_dist = num_dim_dist
        self.EPSILON = EPSILON
        self.is_normalize = is_normalize
        batch, npoints, h, w = heatmaps.shape
        # normalize (clamp guards against an all-zero heatmap)
        heatmap_sum = torch.clamp(heatmaps.sum([2, 3]), min=1e-6)
        self.heatmaps = heatmaps / heatmap_sum.view(batch, npoints, 1, 1)

        # means [batch_size x 68 x 2]
        self.mean = self.get_spatial_mean(self.heatmaps)
        # covars [batch_size x 68 x 2 x 2]
        self.covars = self.get_covariance_matrix(self.heatmaps, self.mean)

        # Eigendecomposition is done on CPU, then moved back.
        # NOTE(review): Tensor.symeig was deprecated and later removed in
        # newer PyTorch (use torch.linalg.eigh); fine for the pinned
        # torch==1.6.0, verify on upgrade.
        _covars = self.covars.view(batch * npoints, 2, 2).cpu()
        evalues, evectors = _covars.symeig(eigenvectors=True)
        # eigenvalues [batch_size x 68 x 2]
        self.evalues = evalues.view(batch, npoints, 2).to(heatmaps)
        # eignvectors [batch_size x 68 x 2 x 2]
        self.evectors = evectors.view(batch, npoints, 2, 2).to(heatmaps)

    def __repr__(self):
        return "Distribution()"

    def plot(self, heatmap, mean, evalues, evectors):
        """Debug view: heatmap with eigenvectors drawn as arrows at the mean."""
        # heatmap is not normalized
        plt.figure(0)
        if heatmap.is_cuda:
            heatmap, mean = heatmap.cpu(), mean.cpu()
            evalues, evectors = evalues.cpu(), evectors.cpu()
        sns.heatmap(heatmap, cmap="RdBu_r")
        for evalue, evector in zip(evalues, evectors):
            plt.arrow(mean[0], mean[1], evalue * evector[0], evalue * evector[1],
                      width=0.2, shape="full")
        plt.show()

    def easy_plot(self, index):
        """Plot the fitted distribution for one (batch, point) pair."""
        # index = (num of batch_size, num of num_points)
        num_bs, num_p = index
        heatmap = self.heatmaps[num_bs, num_p]
        mean = self.mean[num_bs, num_p]
        evalues = self.evalues[num_bs, num_p]
        evectors = self.evectors[num_bs, num_p]
        self.plot(heatmap, mean, evalues, evectors)

    def project_and_scale(self, pts, eigenvalues, eigenvectors):
        """Project pts onto the eigenbasis and whiten by sqrt(eigenvalues)."""
        batch_size, npoints, _ = pts.shape
        proj_pts = torch.matmul(pts.view(batch_size, npoints, 1, 2), eigenvectors)
        scale_proj_pts = proj_pts.view(batch_size, npoints, 2) / torch.sqrt(eigenvalues)
        return scale_proj_pts

    def _make_grid(self, h, w):
        # Coordinate grids: normalized to [-1, 1] or raw pixel indices.
        if self.is_normalize:
            yy, xx = torch.meshgrid(
                torch.arange(h).float() / (h - 1) * 2 - 1,
                torch.arange(w).float() / (w - 1) * 2 - 1)
        else:
            yy, xx = torch.meshgrid(
                torch.arange(h).float(),
                torch.arange(w).float()
            )

        return yy, xx

    def get_spatial_mean(self, heatmap):
        """Probability-weighted (x, y) mean of each normalized heatmap."""
        batch, npoints, h, w = heatmap.shape

        yy, xx = self._make_grid(h, w)
        yy = yy.view(1, 1, h, w).to(heatmap)
        xx = xx.view(1, 1, h, w).to(heatmap)

        yy_coord = (yy * heatmap).sum([2, 3])  # batch x npoints
        xx_coord = (xx * heatmap).sum([2, 3])  # batch x npoints
        coords = torch.stack([xx_coord, yy_coord], dim=-1)
        return coords

    def get_covariance_matrix(self, htp, means):
        """
        Covariance calculation from the normalized heatmaps
        Reference https://en.wikipedia.org/wiki/Weighted_arithmetic_mean#Weighted_sample_covariance
        The unbiased estimate is given by
        Unbiased covariance =
        ___
        \
        /__ w_i (x_i - \mu_i)^T (x_i - \mu_i)

        ___________________________________________

        V_1 - (V_2/V_1)

        ___                ___
        \                  \
        where V_1 = /__ w_i and V_2 = /__ w_i^2


        Input:
        htp = batch_size x 68 x 64 x 64
        means = batch_size x 68 x 2

        Output:
        covariance = batch_size x 68 x 2 x 2
        """
        batch_size = htp.shape[0]
        num_points = htp.shape[1]
        height = htp.shape[2]
        width = htp.shape[3]

        yv, xv = self._make_grid(height, width)
        xv = Variable(xv)
        yv = Variable(yv)

        if htp.is_cuda:
            xv = xv.cuda()
            yv = yv.cuda()

        # Center the coordinate grids on each point's mean.
        xmean = means[:, :, 0]
        xv_minus_mean = xv.expand(batch_size, num_points, -1, -1) - expand_two_dimensions_at_end(xmean, height,
                                                                                                width)  # batch_size x 68 x 64 x 64
        ymean = means[:, :, 1]
        yv_minus_mean = yv.expand(batch_size, num_points, -1, -1) - expand_two_dimensions_at_end(ymean, height,
                                                                                                width)  # batch_size x 68 x 64 x 64

        # These are the unweighted versions
        wt_xv_minus_mean = xv_minus_mean
        wt_yv_minus_mean = yv_minus_mean

        wt_xv_minus_mean = wt_xv_minus_mean.view(batch_size * num_points, height * width)  # batch_size*68 x 4096
        wt_xv_minus_mean = wt_xv_minus_mean.view(batch_size * num_points, 1,
                                                 height * width)  # batch_size*68 x 1 x 4096
        wt_yv_minus_mean = wt_yv_minus_mean.view(batch_size * num_points, height * width)  # batch_size*68 x 4096
        wt_yv_minus_mean = wt_yv_minus_mean.view(batch_size * num_points, 1,
                                                 height * width)  # batch_size*68 x 1 x 4096
        vec_concat = torch.cat((wt_xv_minus_mean, wt_yv_minus_mean), 1)  # batch_size*68 x 2 x 4096

        htp_vec = htp.view(batch_size * num_points, 1, height * width)
        htp_vec = htp_vec.expand(-1, 2, -1)

        # Torch batch matrix multiplication
        # https://pytorch.org/docs/stable/torch.html#torch.bmm
        # Also use the heatmap as the weights at one place now
        covariance = torch.bmm(htp_vec * vec_concat, vec_concat.transpose(1, 2))  # batch_size*68 x 2 x 2
        covariance = covariance.view(batch_size, num_points, self.num_dim_dist,
                                     self.num_dim_dist)  # batch_size x 68 x 2 x 2

        # Bessel-style correction for weighted samples (see docstring).
        V_1 = get_channel_sum(htp) + self.EPSILON  # batch_size x 68
        V_2 = get_channel_sum(torch.pow(htp, 2))  # batch_size x 68
        denominator = V_1 - (V_2 / V_1)

        covariance = covariance / expand_two_dimensions_at_end(denominator, self.num_dim_dist, self.num_dim_dist)

        return (covariance)
diff --git a/LAM_gpro/external/landmark_detection/lib/utils/meter.py b/LAM_gpro/external/landmark_detection/lib/utils/meter.py
new file mode 100644
index 0000000..4ba5f27
--- /dev/null
+++ b/LAM_gpro/external/landmark_detection/lib/utils/meter.py
@@ -0,0 +1,20 @@
class AverageMeter(object):
    """Tracks the latest value, running sum, sample count, and mean."""

    def __init__(self):
        self.reset()

    def reset(self):
        # Clear all statistics.
        self.val, self.avg = 0.0, 0.0
        self.sum, self.count = 0.0, 0.0

    def update(self, val, n=1):
        """Record observation `val` covering `n` samples.

        Note: `val` is added to the sum once (not n times), matching the
        historical behavior of this meter.
        """
        self.val = val
        self.sum += val
        self.count += n
        self.avg = self.sum / self.count

    def __repr__(self):
        return ('{name}(val={val}, avg={avg}, count={count})'.format(name=self.__class__.__name__, **self.__dict__))
\ No newline at end of file
diff --git a/LAM_gpro/external/landmark_detection/lib/utils/time_utils.py b/LAM_gpro/external/landmark_detection/lib/utils/time_utils.py
new file mode 100644
index 0000000..d177aaf
--- /dev/null
+++ b/LAM_gpro/external/landmark_detection/lib/utils/time_utils.py
@@ -0,0 +1,49 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+#
+import time, sys
+import numpy as np
+
+
def time_for_file():
    # Filesystem-safe UTC timestamp, e.g. '01-Jan-at-12-00-00'.
    ISOTIMEFORMAT = '%d-%h-at-%H-%M-%S'
    return '{}'.format(time.strftime(ISOTIMEFORMAT, time.gmtime(time.time())))
+
+
def time_string():
    # Bracketed UTC timestamp for log lines, e.g. '[2024-01-01 12:00:00]'.
    ISOTIMEFORMAT = '%Y-%m-%d %X'
    string = '[{}]'.format(time.strftime(ISOTIMEFORMAT, time.gmtime(time.time())))
    return string
+
+
def time_string_short():
    # Compact UTC date, e.g. '20240101'.
    ISOTIMEFORMAT = '%Y%m%d'
    string = '{}'.format(time.strftime(ISOTIMEFORMAT, time.gmtime(time.time())))
    return string
+
+
def time_print(string, is_print=True):
    # Print `string` prefixed with the current UTC timestamp (optionally muted).
    if (is_print):
        print('{} : {}'.format(time_string(), string))
+
+
def convert_size2str(torch_size):
    """Render a (torch) size or any sequence of dims as '[ d0 d1 ...]'."""
    body = ''.join(' {}'.format(dim) for dim in torch_size)
    return '[' + body + ']'
+
+
def convert_secs2time(epoch_time, return_str=False):
    """Split seconds into (hours, minutes, seconds).

    Args:
        epoch_time: duration in seconds.
        return_str: when True, return the formatted string
            '[Time Left: HH:MM:SS]' instead of the tuple.
    """
    need_hour = int(epoch_time / 3600)
    need_mins = int((epoch_time - 3600 * need_hour) / 60)
    need_secs = int(epoch_time - 3600 * need_hour - 60 * need_mins)
    if return_str:
        # Renamed from `str`, which shadowed the builtin.
        time_str = '[Time Left: {:02d}:{:02d}:{:02d}]'.format(need_hour, need_mins, need_secs)
        return time_str
    else:
        return need_hour, need_mins, need_secs
diff --git a/LAM_gpro/external/landmark_detection/lib/utils/vis_utils.py b/LAM_gpro/external/landmark_detection/lib/utils/vis_utils.py
new file mode 100644
index 0000000..99b5ed1
--- /dev/null
+++ b/LAM_gpro/external/landmark_detection/lib/utils/vis_utils.py
@@ -0,0 +1,31 @@
+import cv2
+import numpy as np
+import numbers
+
+
def plot_points(vis, points, radius=1, color=(255, 255, 0), shift=4, indexes=0, is_index=False):
    """Draw landmark points (and optionally their indexes) onto image `vis`.

    Args:
        vis: BGR image (modified in place by cv2.circle).
        points: list of (x, y) pairs or an ndarray of shape (N, 2).
        radius: scalar radius or per-point radii.
        color: circle color.
        shift: cv2 fixed-point fractional bits for sub-pixel drawing.
        indexes: starting index (int) or explicit per-point index list.
        is_index: when True, also draw each point's index as text.

    Returns:
        The annotated image.
    """
    if isinstance(points, list):
        num_point = len(points)
    elif isinstance(points, np.ndarray):
        # BUG FIX: was `np.numarray`, which does not exist in numpy and
        # raised AttributeError whenever an ndarray was passed.
        num_point = points.shape[0]
    else:
        raise NotImplementedError
    # Broadcast a scalar radius to one radius per point.
    if isinstance(radius, numbers.Number):
        radius = np.zeros((num_point)) + radius

    if isinstance(indexes, numbers.Number):
        indexes = [indexes + i for i in range(num_point)]
    elif isinstance(indexes, list):
        pass
    else:
        raise NotImplementedError

    # cv2 draws with fixed-point coordinates: multiply by 2**shift.
    factor = (1 << shift)
    for (index, p, s) in zip(indexes, points, radius):
        cv2.circle(vis, (int(p[0] * factor + 0.5), int(p[1] * factor + 0.5)),
                   int(s * factor), color, 1, cv2.LINE_AA, shift=shift)
        if is_index:
            vis = cv2.putText(vis, str(index), (int(p[0]), int(p[1])), cv2.FONT_HERSHEY_SIMPLEX, 0.2,
                              (255, 255, 255), 1)

    return vis
diff --git a/LAM_gpro/external/landmark_detection/requirements.txt b/LAM_gpro/external/landmark_detection/requirements.txt
new file mode 100644
index 0000000..2e61114
--- /dev/null
+++ b/LAM_gpro/external/landmark_detection/requirements.txt
@@ -0,0 +1,19 @@
+tqdm
+torch==1.6.0
+torchvision==0.7.0
+python-gflags==3.1.2
+pandas==0.24.2
+pillow==6.0.0
+numpy==1.16.4
+opencv-python==4.1.0.25
+imageio==2.5.0
+imgaug==0.2.9
+lmdb==0.98
+lxml==4.5.0
+tensorboard==2.4.1
+protobuf==3.20
+tensorboardX==1.8
+# pyarrow==0.17.1
+# wandb==0.10.25
+# https://pytorch.org/get-started/previous-versions/
+# pip install torch==1.6.0+cpu torchvision==0.7.0+cpu -f https://download.pytorch.org/whl/torch_stable.html
diff --git a/LAM_gpro/external/landmark_detection/tester.py b/LAM_gpro/external/landmark_detection/tester.py
new file mode 100644
index 0000000..2b79b2c
--- /dev/null
+++ b/LAM_gpro/external/landmark_detection/tester.py
@@ -0,0 +1,49 @@
+import os
+import torch
+from lib import utility
+
+
def test(args):
    """Evaluate a trained landmark model on the configured test split.

    Loads weights from args.pretrained_weight (or <model_dir>/train.pkl),
    runs inference, and logs NME/FR/AUC per output label.
    """
    # conf
    config = utility.get_config(args)
    config.device_id = args.device_ids[0]

    # set environment
    utility.set_environment(config)
    config.init_instance()
    if config.logger is not None:
        config.logger.info("Loaded configure file %s: %s" % (args.config_name, config.id))
        config.logger.info("\n" + "\n".join(["%s: %s" % item for item in config.__dict__.items()]))

    # model
    net = utility.get_net(config)
    model_path = os.path.join(config.model_dir,
                              "train.pkl") if args.pretrained_weight is None else args.pretrained_weight
    # device_ids == [-1] means CPU-only inference.
    if args.device_ids == [-1]:
        checkpoint = torch.load(model_path, map_location="cpu")
    else:
        checkpoint = torch.load(model_path)

    net.load_state_dict(checkpoint["net"])

    if config.logger is not None:
        config.logger.info("Loaded network")
        # config.logger.info('Net flops: {} G, params: {} MB'.format(flops/1e9, params/1e6))

    # data - test
    test_loader = utility.get_dataloader(config, "test")

    if config.logger is not None:
        config.logger.info("Loaded data from {:}".format(config.test_tsv_file))

    # inference
    result, metrics = utility.forward(config, test_loader, net)
    if config.logger is not None:
        config.logger.info("Finished inference")

    # output
    for k, metric in enumerate(metrics):
        if config.logger is not None and len(metric) != 0:
            config.logger.info(
                "Tested {} dataset, the Size is {}, Metric: [NME {:.6f}, FR {:.6f}, AUC {:.6f}]".format(
                    config.type, len(test_loader.dataset), metric[0], metric[1], metric[2]))
diff --git a/LAM_gpro/external/landmark_detection/tools/analysis_motivation.py b/LAM_gpro/external/landmark_detection/tools/analysis_motivation.py
new file mode 100644
index 0000000..bbcbdd3
--- /dev/null
+++ b/LAM_gpro/external/landmark_detection/tools/analysis_motivation.py
@@ -0,0 +1,220 @@
+import glob
+import json
+import os.path as osp
+import numpy as np
+from tqdm import tqdm
+import matplotlib.pyplot as plt
+import seaborn as sns
+from pandas import DataFrame
+import pandas as pd
+
+
def L2(p1, p2):
    # Euclidean distance between two points (or arrays of coordinates).
    return np.linalg.norm(p1 - p2)
+
+
def NME(landmarks_gt, landmarks_pv):
    """Inter-ocular-normalized mean error between two landmark sets.

    Supports the COFW (29), 300W (68) and WFLW (98) point layouts; the
    normalizing span is the distance between the outer eye landmarks.

    Returns:
        (nme, nmeList): the mean normalized error and the per-point list.
    """
    pts_num = landmarks_gt.shape[0]
    if pts_num == 29:
        left_index = 16
        right_index = 17
    elif pts_num == 68:
        left_index = 36
        right_index = 45
    elif pts_num == 98:
        left_index = 60
        right_index = 72
    else:
        # BUG FIX: unsupported counts previously fell through and crashed
        # below with a NameError on `left_index`; fail clearly instead.
        raise ValueError("unsupported landmark count: %d" % pts_num)

    nme = 0
    eye_span = L2(landmarks_gt[left_index], landmarks_gt[right_index])
    nmeList = []
    for i in range(pts_num):
        error = L2(landmarks_pv[i], landmarks_gt[i])
        _nme = error / eye_span
        nmeList.append(_nme)
        nme += _nme
    nme /= pts_num
    return nme, nmeList
+
+
def NME_analysis(listA):
    # Annotate each result record (dict with 'pred'/'gt' landmark lists)
    # with its overall 'nme' and per-point 'nmeList'.  Mutates in place.
    for jsonA in listA:
        pred = np.array(jsonA['pred'])
        gt = np.array(jsonA['gt'])
        nme, nmeList = NME(gt, pred)
        jsonA['nme'] = nme
        jsonA['nmeList'] = nmeList
    return listA
+
+
def nme_analysis(listA):
    # Split each record's per-point NME into boundary points (WFLW indices
    # 0-32, the face contour) vs. the remaining inner-face points, and
    # print the mean of each group.
    bdy_nmeList = []
    scene_nmeList = []
    for jsonA in tqdm(listA):
        nme = jsonA['nmeList']
        nme = np.array(nme)
        bdy_nme = np.mean(nme[:33])
        scene_nme = np.mean(nme[33:])
        # scene_nme = np.mean(nme[[33, 35, 40, 38,
        #                          60, 62, 96, 66, 64,
        #                          50, 44, 48, 46,
        #                          68, 70, 97, 74, 72,
        #                          54, 55, 57, 59,
        #                          76, 82, 79, 90, 94, 85, 16]])
        bdy_nmeList.append(bdy_nme)
        scene_nmeList.append(scene_nme)
    print('bdy nme: {:.4f}'.format(np.mean(bdy_nmeList)))
    print('scene_nmeList: {:.4f}'.format(np.mean(scene_nmeList)))
+
+
def Energy_analysis(listA, easyThresh=0.02, easyNum=10, hardThresh=0.07, hardNum=10):
    """Compare covariance "energy" between easy and hard samples.

    Collects up to easyNum samples with nme < easyThresh and hardNum with
    nme > hardThresh, computing for each the mean eigenvalue eccentricity
    (lambda_max / lambda_min) over all landmarks, then prints a summary
    table.  Returns (easyDict, hardDict) with 'energy' and 'nme' lists.
    """
    easyDict = {'energy': [], 'nme': []}
    hardDict = {'energy': [], 'nme': []}

    _easyNum, _hardNum = 0, 0

    def cal_energy(evalues):
        # Mean per-point eigenvalue ratio (elliptical eccentricity proxy).
        evalues = np.array(evalues)
        # _energy = _energy.max(1)
        eccentricity = evalues.max(1) / evalues.min(1)
        # _energy = _energy.sum() / 2
        _energy = np.mean(eccentricity)
        return _energy

    for jsonA in tqdm(listA):
        nme = jsonA['nme']
        evalues = jsonA['evalues']

        # Stop once both buckets are full.
        if _easyNum == easyNum and _hardNum == hardNum:
            break

        if nme < easyThresh and _easyNum < easyNum:
            energy = cal_energy(evalues)
            easyDict['energy'].append(energy)
            easyDict['nme'].append(nme)
            _easyNum += 1
        elif nme > hardThresh and _hardNum < hardNum:
            energy = cal_energy(evalues)
            hardDict['energy'].append(energy)
            hardDict['nme'].append(nme)
            _hardNum += 1

    print('easyThresh: < {}; hardThresh > {}'.format(easyThresh, hardThresh))
    print('              |nme     |energy    |num  |')
    print('easy samples: |{:.4f}  |{:.4f}    |{}   |'.format(np.mean(easyDict['nme']),
                                                             np.mean(easyDict['energy']),
                                                             len(easyDict['energy'])))
    print('hard samples: |{:.4f}  |{:.4f}    |{}   |'.format(np.mean(hardDict['nme']),
                                                             np.mean(hardDict['energy']),
                                                             len(hardDict['energy'])))

    return easyDict, hardDict
+
+
def Eccentricity_analysis(listA):
    # Compare mean eigenvalue eccentricity of eye-corner landmarks (WFLW
    # indices 60/64/68/72) against face-contour landmarks (0-32).
    eyecornerList = []
    boundaryList = []
    for jsonA in listA:
        evalues = np.array(jsonA['evalues'])
        eccentricity = evalues.max(1) / evalues.min(1)

        eyecorner = np.mean(eccentricity[[60, 64, 68, 72]])
        boundary = np.mean(eccentricity[0:33])
        eyecornerList.append(eyecorner)
        boundaryList.append(boundary)

    print('eyecorner: {:.4f}'.format(np.mean(eyecornerList)))
    print('boundary: {:.4f}'.format(np.mean(boundaryList)))
    return eyecornerList, boundaryList
+
+
def plot_bar(dataList):
    # Bar chart of one value per WFLW landmark (expects exactly 98 values).
    x = list(range(98))
    assert len(x) == len(dataList)
    _x = 'Landmark Index'
    # _y = 'elliptical eccentricity (λ1/λ2)'
    _y = 'PCA Analyze (λ1/λ2)'
    data = {
        _x: x,
        _y: dataList
    }
    df = DataFrame(data)
    plt.figure(figsize=(10, 4))
    sns.barplot(x=_x, y=_y, data=df)
    plt.show()
+
+
def Eccentricity_analysis2(listA, is_vis=False):
    # Mean eigenvalue eccentricity per landmark across all samples;
    # optionally visualized as a bar chart.  Returns the 98 per-landmark means.
    landmarksList = [[] for i in range(98)]
    for jsonA in listA:
        evalues = np.array(jsonA['evalues'])
        eccentricity = evalues.max(1) / evalues.min(1)
        for i, e in enumerate(eccentricity):
            landmarksList[i].append(e)
    print('Mean value: {:.4f}'.format(np.mean(np.array(landmarksList))))
    landmarksList = [np.mean(l) for l in landmarksList]
    if is_vis:
        plot_bar(landmarksList)
    return landmarksList
+
+
def std_analysis2():
    """Analyze per-landmark prediction spread across repeated runs.

    Loads saved prediction .npy files (hard-coded cluster paths), computes
    per-landmark std of the distance to the run-mean prediction, and prints
    boundary vs. inner-landmark summaries.
    """
    save_dir = '/apdcephfs/share_1134483/charlinzhou/experiment/cvpr-23/wflw_results'
    # l2_npy = glob.glob(osp.join(save_dir, '*DSNT*.npy'))
    l2_npy = glob.glob(osp.join(save_dir, '*MHNLoss_v2_l2*.npy'))

    def npy2std(npyList):
        # Stack runs along axis 0, undo the [-1, 1] -> pixel normalization,
        # then std of each point's distance to the mean prediction.
        datas = [np.load(npy)[np.newaxis, :] for npy in npyList]
        datas = np.concatenate(datas, axis=0)
        # denormalization
        datas = (datas + 1) * 256 / 2
        mean = datas.mean(axis=0)[np.newaxis, :]
        dist = np.linalg.norm(datas - mean, axis=-1)
        std = np.std(dist, 0)
        print('min: {}, max:{}, mean:{}'.format(std.min(), std.max(), std.mean()))
        return std

    std1 = npy2std(l2_npy)
    std1 = std1.mean(0)
    # plot_bar(std1)
    bdy_std = np.mean(std1[:33])
    cofw_std = np.mean(std1[[33, 35, 40, 38,
                             60, 62, 96, 66, 64,
                             50, 44, 48, 46,
                             68, 70, 97, 74, 72,
                             54, 55, 57, 59,
                             76, 82, 79, 90, 94, 85, 16]])
    print('bdy_std: {:.4f}, cofw_std: {:.4f}'.format(bdy_std, cofw_std))
    print('the ratio of Boundary std and ALL std: {:.4f} / {:.4f}'.format(np.sum(std1[:33]), np.sum(std1)))
+
+
if __name__ == '__main__':
    # model from 4.29
    json_path = '/apdcephfs/share_1134483/charlinzhou/ckpts/STAR/WFLW/WFLW_256x256_adam_ep500_lr0.001_bs128_STARLoss_smoothl1_1_b0183746-161a-4b76-9cb9-8a2059090233/results.json'
    # without initialization
    # json_path = '/apdcephfs/share_1134483/charlinzhou/ckpts/STAR/WFLW/WFLW_256x256_adam_ep500_lr0.001_bs128_STARLoss_smoothl1_1_9cff3656-8ca8-4c3d-a95d-da76f9f76ea5/results.json'
    # model from 4.02
    # json_path = '/apdcephfs/share_1134483/charlinzhou/ckpts/STAR/WFLW/WFLW_256x256_adam_ep500_lr0.001_bs128_STARLoss_smoothl1_1_AAM_2d2bb70e-6fdb-459c-baf7-18c89e7a165f/results.json'
    listA = json.load(open(json_path, 'r'))
    print('Load Done!')
    listA = NME_analysis(listA)
    print('NME analysis Done!')
    # Exp1: analyze the energy gap between easy and hard samples
    easyDict, hardDict = Energy_analysis(listA, easyNum=2500, hardNum=2500, easyThresh=0.03, hardThresh=0.08)

    # Exp2.1: analyze the eccentricity difference between eye-corner and contour points
    # eyecornerList, boundaryList = Eccentricity_analysis(listA)

    # Exp2.2: visualize the eccentricity distribution of all points
    # landmarksList = Eccentricity_analysis2(listA, is_vis=True)

    # Exp2.3: visualize the variance distribution of all points
    # std_analysis2()

    # Exp3: NME analysis of facial features vs. face contour
    # nme_analysis(listA)
    # print(easyDict)
    # print(hardDict)

    # nmeList = [jsonA['nme'] for jsonA in listA]
    # print(len(nmeList))
diff --git a/LAM_gpro/external/landmark_detection/tools/infinite_loop.py b/LAM_gpro/external/landmark_detection/tools/infinite_loop.py
new file mode 100644
index 0000000..510011e
--- /dev/null
+++ b/LAM_gpro/external/landmark_detection/tools/infinite_loop.py
@@ -0,0 +1,4 @@
+# Sleep forever: keeps the process alive (e.g. to hold a cluster job slot)
+# while consuming negligible CPU. Runs until killed externally.
+import time
+
+while True:
+    time.sleep(1)
diff --git a/LAM_gpro/external/landmark_detection/tools/infinite_loop_gpu.py b/LAM_gpro/external/landmark_detection/tools/infinite_loop_gpu.py
new file mode 100644
index 0000000..6bfc2a5
--- /dev/null
+++ b/LAM_gpro/external/landmark_detection/tools/infinite_loop_gpu.py
@@ -0,0 +1,21 @@
+# -*- coding: utf-8 -*-
+# Keep a GPU busy with a trivial repeated computation (e.g. to hold a
+# cluster GPU allocation). Runs until killed externally.
+
+import os
+import time
+import torch
+import argparse
+
+parser = argparse.ArgumentParser(description='inf')
+parser.add_argument('--gpu', default='1', type=str, help='index of gpu to use')
+args = parser.parse_args()
+
+# Restrict CUDA to the requested device; must happen before any CUDA
+# context is created by torch.
+os.environ["CUDA_VISIBLE_DEVICES"] = args.gpu
+
+n = 1000
+
+# A 4 x n x n work tensor resident on the GPU.
+x = torch.zeros(4, n, n).cuda()
+# Tiny sleep between the two multiplies; effectively zero but yields the GIL.
+rest_time = 0.0000000000001
+while True:
+    y = x * x
+    time.sleep(rest_time)
+    # Second multiply — presumably just extra filler work to raise GPU
+    # utilization; the results y/y1 are intentionally discarded.
+    y1 = x * x
diff --git a/LAM_gpro/external/landmark_detection/tools/split_wflw.py b/LAM_gpro/external/landmark_detection/tools/split_wflw.py
new file mode 100644
index 0000000..0337f42
--- /dev/null
+++ b/LAM_gpro/external/landmark_detection/tools/split_wflw.py
@@ -0,0 +1,38 @@
+import csv
+import os.path as osp
+import numpy as np
+import pandas as pd
+from tqdm import tqdm
+
+tsv_file = '/apdcephfs/share_1134483/charlinzhou/datas/ADNet/WFLW/test.tsv'
+save_folder = '/apdcephfs/share_1134483/charlinzhou/datas/ADNet/_WFLW/'
+
+save_tags = ['largepose', 'expression', 'illumination', 'makeup', 'occlusion', 'blur']
+save_tags = ['test_{}_metadata.tsv'.format(t) for t in save_tags]
+save_files = [osp.join(save_folder, t) for t in save_tags]
+save_files = [open(f, 'w', newline='') for f in save_files]
+
+landmark_num = 98
+items = pd.read_csv(tsv_file, sep="\t")
+
+items_num = len(items)
+for index in tqdm(range(items_num)):
+ image_path = items.iloc[index, 0]
+ landmarks_5pts = items.iloc[index, 1]
+ # landmarks_5pts = np.array(list(map(float, landmarks_5pts.split(","))), dtype=np.float32).reshape(5, 2)
+ landmarks_target = items.iloc[index, 2]
+ # landmarks_target = np.array(list(map(float, landmarks_target.split(","))), dtype=np.float32).reshape(landmark_num, 2)
+ scale = items.iloc[index, 3]
+ center_w, center_h = items.iloc[index, 4], items.iloc[index, 5]
+ if len(items.iloc[index]) > 6:
+ tags = np.array(list(map(lambda x: int(float(x)), items.iloc[index, 6].split(","))))
+ else:
+ tags = np.array([])
+ assert len(tags) == 6, '{} v.s. 6'.format(len(tags))
+ for k, tag in enumerate(tags):
+ if tag == 1:
+ save_file = save_files[k]
+ tsv_w = csv.writer(save_file, delimiter='\t')
+ tsv_w.writerow([image_path, landmarks_5pts, landmarks_target, scale, center_w, center_h])
+
+print('Done!')
diff --git a/LAM_gpro/external/landmark_detection/tools/testtime_pca.py b/LAM_gpro/external/landmark_detection/tools/testtime_pca.py
new file mode 100644
index 0000000..c231a96
--- /dev/null
+++ b/LAM_gpro/external/landmark_detection/tools/testtime_pca.py
@@ -0,0 +1,107 @@
+import torch
+import torch.nn as nn
+from torch.autograd import Variable
+
+
+def get_channel_sum(input):
+    # Sum a 4D tensor over its last two (spatial) dimensions in two steps,
+    # producing per-channel totals of shape [batch, channels].
+    temp = torch.sum(input, dim=3)
+    output = torch.sum(temp, dim=2)
+    return output
+
+
+def expand_two_dimensions_at_end(input, dim1, dim2):
+    # Append two trailing axes to a 2D tensor and broadcast them to
+    # (dim1, dim2): [B, N] -> [B, N, dim1, dim2]. expand() creates a view,
+    # so no data is copied.
+    input = input.unsqueeze(-1).unsqueeze(-1)
+    input = input.expand(-1, -1, dim1, dim2)
+    return input
+
+
+class TestTimePCA(nn.Module):
+ def __init__(self):
+ super(TestTimePCA, self).__init__()
+
+ def _make_grid(self, h, w):
+ yy, xx = torch.meshgrid(
+ torch.arange(h).float() / (h - 1) * 2 - 1,
+ torch.arange(w).float() / (w - 1) * 2 - 1)
+ return yy, xx
+
+ def weighted_mean(self, heatmap):
+ batch, npoints, h, w = heatmap.shape
+
+ yy, xx = self._make_grid(h, w)
+ yy = yy.view(1, 1, h, w).to(heatmap)
+ xx = xx.view(1, 1, h, w).to(heatmap)
+
+ yy_coord = (yy * heatmap).sum([2, 3]) # batch x npoints
+ xx_coord = (xx * heatmap).sum([2, 3]) # batch x npoints
+ coords = torch.stack([xx_coord, yy_coord], dim=-1)
+ return coords
+
+ def unbiased_weighted_covariance(self, htp, means, num_dim_image=2, EPSILON=1e-5):
+ batch_size, num_points, height, width = htp.shape
+
+ yv, xv = self._make_grid(height, width)
+ xv = Variable(xv)
+ yv = Variable(yv)
+
+ if htp.is_cuda:
+ xv = xv.cuda()
+ yv = yv.cuda()
+
+ xmean = means[:, :, 0]
+ xv_minus_mean = xv.expand(batch_size, num_points, -1, -1) - expand_two_dimensions_at_end(xmean, height,
+ width) # [batch_size, 68, 64, 64]
+ ymean = means[:, :, 1]
+ yv_minus_mean = yv.expand(batch_size, num_points, -1, -1) - expand_two_dimensions_at_end(ymean, height,
+ width) # [batch_size, 68, 64, 64]
+ wt_xv_minus_mean = xv_minus_mean
+ wt_yv_minus_mean = yv_minus_mean
+
+ wt_xv_minus_mean = wt_xv_minus_mean.view(batch_size * num_points, height * width) # [batch_size*68, 4096]
+ wt_xv_minus_mean = wt_xv_minus_mean.view(batch_size * num_points, 1, height * width) # [batch_size*68, 1, 4096]
+ wt_yv_minus_mean = wt_yv_minus_mean.view(batch_size * num_points, height * width) # [batch_size*68, 4096]
+ wt_yv_minus_mean = wt_yv_minus_mean.view(batch_size * num_points, 1, height * width) # [batch_size*68, 1, 4096]
+ vec_concat = torch.cat((wt_xv_minus_mean, wt_yv_minus_mean), 1) # [batch_size*68, 2, 4096]
+
+ htp_vec = htp.view(batch_size * num_points, 1, height * width)
+ htp_vec = htp_vec.expand(-1, 2, -1)
+
+ covariance = torch.bmm(htp_vec * vec_concat, vec_concat.transpose(1, 2)) # [batch_size*68, 2, 2]
+ covariance = covariance.view(batch_size, num_points, num_dim_image, num_dim_image) # [batch_size, 68, 2, 2]
+
+ V_1 = htp.sum([2, 3]) + EPSILON # [batch_size, 68]
+ V_2 = torch.pow(htp, 2).sum([2, 3]) + EPSILON # [batch_size, 68]
+
+ denominator = V_1 - (V_2 / V_1)
+ covariance = covariance / expand_two_dimensions_at_end(denominator, num_dim_image, num_dim_image)
+
+ return covariance
+
+ def forward(self, heatmap, groudtruth):
+
+ batch, npoints, h, w = heatmap.shape
+
+ heatmap_sum = torch.clamp(heatmap.sum([2, 3]), min=1e-6)
+ heatmap = heatmap / heatmap_sum.view(batch, npoints, 1, 1)
+
+ # means [batch_size, 68, 2]
+ means = self.weighted_mean(heatmap)
+
+ # covars [batch_size, 68, 2, 2]
+ covars = self.unbiased_weighted_covariance(heatmap, means)
+
+ # eigenvalues [batch_size * 68, 2] , eigenvectors [batch_size * 68, 2, 2]
+ covars = covars.view(batch * npoints, 2, 2).cpu()
+ evalues, evectors = covars.symeig(eigenvectors=True)
+ evalues = evalues.view(batch, npoints, 2)
+ evectors = evectors.view(batch, npoints, 2, 2)
+ means = means.cpu()
+
+ results = [dict() for _ in range(batch)]
+ for i in range(batch):
+ results[i]['pred'] = means[i].numpy().tolist()
+ results[i]['gt'] = groudtruth[i].cpu().numpy().tolist()
+ results[i]['evalues'] = evalues[i].numpy().tolist()
+ results[i]['evectors'] = evectors[i].numpy().tolist()
+
+ return results
diff --git a/LAM_gpro/external/nvdiffrast/LICENSE.txt b/LAM_gpro/external/nvdiffrast/LICENSE.txt
new file mode 100644
index 0000000..26a070a
--- /dev/null
+++ b/LAM_gpro/external/nvdiffrast/LICENSE.txt
@@ -0,0 +1,97 @@
+Copyright (c) 2020, NVIDIA Corporation. All rights reserved.
+
+
+Nvidia Source Code License (1-Way Commercial)
+
+=======================================================================
+
+1. Definitions
+
+"Licensor" means any person or entity that distributes its Work.
+
+"Software" means the original work of authorship made available under
+this License.
+
+"Work" means the Software and any additions to or derivative works of
+the Software that are made available under this License.
+
+The terms "reproduce," "reproduction," "derivative works," and
+"distribution" have the meaning as provided under U.S. copyright law;
+provided, however, that for the purposes of this License, derivative
+works shall not include works that remain separable from, or merely
+link (or bind by name) to the interfaces of, the Work.
+
+Works, including the Software, are "made available" under this License
+by including in or with the Work either (a) a copyright notice
+referencing the applicability of this License to the Work, or (b) a
+copy of this License.
+
+2. License Grants
+
+ 2.1 Copyright Grant. Subject to the terms and conditions of this
+ License, each Licensor grants to you a perpetual, worldwide,
+ non-exclusive, royalty-free, copyright license to reproduce,
+ prepare derivative works of, publicly display, publicly perform,
+ sublicense and distribute its Work and any resulting derivative
+ works in any form.
+
+3. Limitations
+
+ 3.1 Redistribution. You may reproduce or distribute the Work only
+ if (a) you do so under this License, (b) you include a complete
+ copy of this License with your distribution, and (c) you retain
+ without modification any copyright, patent, trademark, or
+ attribution notices that are present in the Work.
+
+ 3.2 Derivative Works. You may specify that additional or different
+ terms apply to the use, reproduction, and distribution of your
+ derivative works of the Work ("Your Terms") only if (a) Your Terms
+ provide that the use limitation in Section 3.3 applies to your
+ derivative works, and (b) you identify the specific derivative
+ works that are subject to Your Terms. Notwithstanding Your Terms,
+ this License (including the redistribution requirements in Section
+ 3.1) will continue to apply to the Work itself.
+
+ 3.3 Use Limitation. The Work and any derivative works thereof only
+ may be used or intended for use non-commercially. The Work or
+ derivative works thereof may be used or intended for use by Nvidia
+ or its affiliates commercially or non-commercially. As used herein,
+ "non-commercially" means for research or evaluation purposes only
+ and not for any direct or indirect monetary gain.
+
+ 3.4 Patent Claims. If you bring or threaten to bring a patent claim
+ against any Licensor (including any claim, cross-claim or
+ counterclaim in a lawsuit) to enforce any patents that you allege
+ are infringed by any Work, then your rights under this License from
+ such Licensor (including the grant in Section 2.1) will terminate
+ immediately.
+
+ 3.5 Trademarks. This License does not grant any rights to use any
+ Licensor's or its affiliates' names, logos, or trademarks, except
+ as necessary to reproduce the notices described in this License.
+
+ 3.6 Termination. If you violate any term of this License, then your
+ rights under this License (including the grant in Section 2.1) will
+ terminate immediately.
+
+4. Disclaimer of Warranty.
+
+THE WORK IS PROVIDED "AS IS" WITHOUT WARRANTIES OR CONDITIONS OF ANY
+KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WARRANTIES OR CONDITIONS OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, TITLE OR
+NON-INFRINGEMENT. YOU BEAR THE RISK OF UNDERTAKING ANY ACTIVITIES UNDER
+THIS LICENSE.
+
+5. Limitation of Liability.
+
+EXCEPT AS PROHIBITED BY APPLICABLE LAW, IN NO EVENT AND UNDER NO LEGAL
+THEORY, WHETHER IN TORT (INCLUDING NEGLIGENCE), CONTRACT, OR OTHERWISE
+SHALL ANY LICENSOR BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY DIRECT,
+INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES ARISING OUT OF
+OR RELATED TO THIS LICENSE, THE USE OR INABILITY TO USE THE WORK
+(INCLUDING BUT NOT LIMITED TO LOSS OF GOODWILL, BUSINESS INTERRUPTION,
+LOST PROFITS OR DATA, COMPUTER FAILURE OR MALFUNCTION, OR ANY OTHER
+COMMERCIAL DAMAGES OR LOSSES), EVEN IF THE LICENSOR HAS BEEN ADVISED OF
+THE POSSIBILITY OF SUCH DAMAGES.
+
+=======================================================================
diff --git a/LAM_gpro/external/nvdiffrast/README.md b/LAM_gpro/external/nvdiffrast/README.md
new file mode 100644
index 0000000..3eeb411
--- /dev/null
+++ b/LAM_gpro/external/nvdiffrast/README.md
@@ -0,0 +1,42 @@
+## Nvdiffrast – Modular Primitives for High-Performance Differentiable Rendering
+
+
+
+**Modular Primitives for High-Performance Differentiable Rendering**
+Samuli Laine, Janne Hellsten, Tero Karras, Yeongho Seol, Jaakko Lehtinen, Timo Aila
+[http://arxiv.org/abs/2011.03277](http://arxiv.org/abs/2011.03277)
+
+Nvdiffrast is a PyTorch/TensorFlow library that provides high-performance primitive operations for rasterization-based differentiable rendering.
+Please refer to ☞☞ [nvdiffrast documentation](https://nvlabs.github.io/nvdiffrast) ☜☜ for more information.
+
+## Licenses
+
+Copyright © 2020–2024, NVIDIA Corporation. All rights reserved.
+
+This work is made available under the [Nvidia Source Code License](https://github.com/NVlabs/nvdiffrast/blob/main/LICENSE.txt).
+
+For business inquiries, please visit our website and submit the form: [NVIDIA Research Licensing](https://www.nvidia.com/en-us/research/inquiries/)
+
+We do not currently accept outside code contributions in the form of pull requests.
+
+Environment map stored as part of `samples/data/envphong.npz` is derived from a Wave Engine
+[sample material](https://github.com/WaveEngine/Samples-2.5/tree/master/Materials/EnvironmentMap/Content/Assets/CubeMap.cubemap)
+originally shared under
+[MIT License](https://github.com/WaveEngine/Samples-2.5/blob/master/LICENSE.md).
+Mesh and texture stored as part of `samples/data/earth.npz` are derived from
+[3D Earth Photorealistic 2K](https://www.turbosquid.com/3d-models/3d-realistic-earth-photorealistic-2k-1279125)
+model originally made available under
+[TurboSquid 3D Model License](https://blog.turbosquid.com/turbosquid-3d-model-license/#3d-model-license).
+
+## Citation
+
+```
+@article{Laine2020diffrast,
+ title = {Modular Primitives for High-Performance Differentiable Rendering},
+ author = {Samuli Laine and Janne Hellsten and Tero Karras and Yeongho Seol and Jaakko Lehtinen and Timo Aila},
+ journal = {ACM Transactions on Graphics},
+ year = {2020},
+ volume = {39},
+ number = {6}
+}
+```
diff --git a/LAM_gpro/external/nvdiffrast/docker/10_nvidia.json b/LAM_gpro/external/nvdiffrast/docker/10_nvidia.json
new file mode 100644
index 0000000..2bfcca0
--- /dev/null
+++ b/LAM_gpro/external/nvdiffrast/docker/10_nvidia.json
@@ -0,0 +1,6 @@
+{
+ "file_format_version" : "1.0.0",
+ "ICD" : {
+ "library_path" : "libEGL_nvidia.so.0"
+ }
+}
diff --git a/LAM_gpro/external/nvdiffrast/docker/Dockerfile b/LAM_gpro/external/nvdiffrast/docker/Dockerfile
new file mode 100644
index 0000000..f32d27e
--- /dev/null
+++ b/LAM_gpro/external/nvdiffrast/docker/Dockerfile
@@ -0,0 +1,51 @@
+# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
+#
+# NVIDIA CORPORATION and its licensors retain all intellectual property
+# and proprietary rights in and to this software, related documentation
+# and any modifications thereto. Any use, reproduction, disclosure or
+# distribution of this software and related documentation without an express
+# license agreement from NVIDIA CORPORATION is strictly prohibited.
+
+# Note: Should also work with NVIDIA's Docker image builds such as
+#
+# nvcr.io/nvidia/pytorch:20.09-py3
+#
+# This file defaults to pytorch/pytorch as it works on slightly older
+# driver versions.
+# NOTE(review): the header above is stale — the base image below is the
+# NVIDIA NGC PyTorch 23.03 build, not pytorch/pytorch.
+FROM nvcr.io/nvidia/pytorch:23.03-py3
+
+# OpenGL/EGL runtime and -dev packages required by nvdiffrast's rasterizer,
+# plus cmake/curl for building native extensions.
+RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
+    pkg-config \
+    libglvnd0 \
+    libgl1 \
+    libglx0 \
+    libegl1 \
+    libgles2 \
+    libglvnd-dev \
+    libgl1-mesa-dev \
+    libegl1-mesa-dev \
+    libgles2-mesa-dev \
+    cmake \
+    curl
+
+# Keep Python output unbuffered and skip .pyc files inside the container.
+ENV PYTHONDONTWRITEBYTECODE=1
+ENV PYTHONUNBUFFERED=1
+
+# for GLEW
+ENV LD_LIBRARY_PATH /usr/lib64:$LD_LIBRARY_PATH
+
+# nvidia-container-runtime
+ENV NVIDIA_VISIBLE_DEVICES all
+ENV NVIDIA_DRIVER_CAPABILITIES compute,utility,graphics
+
+# Default pyopengl to EGL for good headless rendering support
+ENV PYOPENGL_PLATFORM egl
+
+# Register the NVIDIA EGL vendor library for glvnd (headless EGL).
+COPY docker/10_nvidia.json /usr/share/glvnd/egl_vendor.d/10_nvidia.json
+
+RUN pip install --upgrade pip
+RUN pip install ninja imageio imageio-ffmpeg
+
+# Install nvdiffrast itself from the build context.
+COPY nvdiffrast /tmp/pip/nvdiffrast/
+COPY README.md setup.py /tmp/pip/
+RUN cd /tmp/pip && pip install .
diff --git a/LAM_gpro/external/nvdiffrast/docs/img/cube.png b/LAM_gpro/external/nvdiffrast/docs/img/cube.png
new file mode 100644
index 0000000..92b63e6
Binary files /dev/null and b/LAM_gpro/external/nvdiffrast/docs/img/cube.png differ
diff --git a/LAM_gpro/external/nvdiffrast/docs/img/earth.png b/LAM_gpro/external/nvdiffrast/docs/img/earth.png
new file mode 100644
index 0000000..d30989a
Binary files /dev/null and b/LAM_gpro/external/nvdiffrast/docs/img/earth.png differ
diff --git a/LAM_gpro/external/nvdiffrast/docs/img/envphong.png b/LAM_gpro/external/nvdiffrast/docs/img/envphong.png
new file mode 100644
index 0000000..2c6f390
Binary files /dev/null and b/LAM_gpro/external/nvdiffrast/docs/img/envphong.png differ
diff --git a/LAM_gpro/external/nvdiffrast/docs/img/logo.png b/LAM_gpro/external/nvdiffrast/docs/img/logo.png
new file mode 100644
index 0000000..827d907
Binary files /dev/null and b/LAM_gpro/external/nvdiffrast/docs/img/logo.png differ
diff --git a/LAM_gpro/external/nvdiffrast/docs/img/pipe_cube.png b/LAM_gpro/external/nvdiffrast/docs/img/pipe_cube.png
new file mode 100644
index 0000000..6410c72
Binary files /dev/null and b/LAM_gpro/external/nvdiffrast/docs/img/pipe_cube.png differ
diff --git a/LAM_gpro/external/nvdiffrast/docs/img/pipe_earth.png b/LAM_gpro/external/nvdiffrast/docs/img/pipe_earth.png
new file mode 100644
index 0000000..c46ab68
Binary files /dev/null and b/LAM_gpro/external/nvdiffrast/docs/img/pipe_earth.png differ
diff --git a/LAM_gpro/external/nvdiffrast/docs/img/pipe_envphong.png b/LAM_gpro/external/nvdiffrast/docs/img/pipe_envphong.png
new file mode 100644
index 0000000..524c5c4
Binary files /dev/null and b/LAM_gpro/external/nvdiffrast/docs/img/pipe_envphong.png differ
diff --git a/LAM_gpro/external/nvdiffrast/docs/img/pose.png b/LAM_gpro/external/nvdiffrast/docs/img/pose.png
new file mode 100644
index 0000000..908c097
Binary files /dev/null and b/LAM_gpro/external/nvdiffrast/docs/img/pose.png differ
diff --git a/LAM_gpro/external/nvdiffrast/docs/img/spot_aa.png b/LAM_gpro/external/nvdiffrast/docs/img/spot_aa.png
new file mode 100644
index 0000000..c957e3b
Binary files /dev/null and b/LAM_gpro/external/nvdiffrast/docs/img/spot_aa.png differ
diff --git a/LAM_gpro/external/nvdiffrast/docs/img/spot_crop1.png b/LAM_gpro/external/nvdiffrast/docs/img/spot_crop1.png
new file mode 100644
index 0000000..c43c699
Binary files /dev/null and b/LAM_gpro/external/nvdiffrast/docs/img/spot_crop1.png differ
diff --git a/LAM_gpro/external/nvdiffrast/docs/img/spot_crop2.png b/LAM_gpro/external/nvdiffrast/docs/img/spot_crop2.png
new file mode 100644
index 0000000..e2c5a04
Binary files /dev/null and b/LAM_gpro/external/nvdiffrast/docs/img/spot_crop2.png differ
diff --git a/LAM_gpro/external/nvdiffrast/docs/img/spot_diff1.png b/LAM_gpro/external/nvdiffrast/docs/img/spot_diff1.png
new file mode 100644
index 0000000..ebc65a2
Binary files /dev/null and b/LAM_gpro/external/nvdiffrast/docs/img/spot_diff1.png differ
diff --git a/LAM_gpro/external/nvdiffrast/docs/img/spot_diff2.png b/LAM_gpro/external/nvdiffrast/docs/img/spot_diff2.png
new file mode 100644
index 0000000..14a7b6d
Binary files /dev/null and b/LAM_gpro/external/nvdiffrast/docs/img/spot_diff2.png differ
diff --git a/LAM_gpro/external/nvdiffrast/docs/img/spot_peel1.png b/LAM_gpro/external/nvdiffrast/docs/img/spot_peel1.png
new file mode 100644
index 0000000..80970c5
Binary files /dev/null and b/LAM_gpro/external/nvdiffrast/docs/img/spot_peel1.png differ
diff --git a/LAM_gpro/external/nvdiffrast/docs/img/spot_peel2.png b/LAM_gpro/external/nvdiffrast/docs/img/spot_peel2.png
new file mode 100644
index 0000000..269fa4b
Binary files /dev/null and b/LAM_gpro/external/nvdiffrast/docs/img/spot_peel2.png differ
diff --git a/LAM_gpro/external/nvdiffrast/docs/img/spot_st.png b/LAM_gpro/external/nvdiffrast/docs/img/spot_st.png
new file mode 100644
index 0000000..669470f
Binary files /dev/null and b/LAM_gpro/external/nvdiffrast/docs/img/spot_st.png differ
diff --git a/LAM_gpro/external/nvdiffrast/docs/img/spot_tex.png b/LAM_gpro/external/nvdiffrast/docs/img/spot_tex.png
new file mode 100644
index 0000000..8308898
Binary files /dev/null and b/LAM_gpro/external/nvdiffrast/docs/img/spot_tex.png differ
diff --git a/LAM_gpro/external/nvdiffrast/docs/img/spot_texture.png b/LAM_gpro/external/nvdiffrast/docs/img/spot_texture.png
new file mode 100644
index 0000000..6309448
Binary files /dev/null and b/LAM_gpro/external/nvdiffrast/docs/img/spot_texture.png differ
diff --git a/LAM_gpro/external/nvdiffrast/docs/img/spot_texw.png b/LAM_gpro/external/nvdiffrast/docs/img/spot_texw.png
new file mode 100644
index 0000000..6191c79
Binary files /dev/null and b/LAM_gpro/external/nvdiffrast/docs/img/spot_texw.png differ
diff --git a/LAM_gpro/external/nvdiffrast/docs/img/spot_tri.png b/LAM_gpro/external/nvdiffrast/docs/img/spot_tri.png
new file mode 100644
index 0000000..8142279
Binary files /dev/null and b/LAM_gpro/external/nvdiffrast/docs/img/spot_tri.png differ
diff --git a/LAM_gpro/external/nvdiffrast/docs/img/spot_uv.png b/LAM_gpro/external/nvdiffrast/docs/img/spot_uv.png
new file mode 100644
index 0000000..da2f744
Binary files /dev/null and b/LAM_gpro/external/nvdiffrast/docs/img/spot_uv.png differ
diff --git a/LAM_gpro/external/nvdiffrast/docs/img/teaser.png b/LAM_gpro/external/nvdiffrast/docs/img/teaser.png
new file mode 100644
index 0000000..cca878e
Binary files /dev/null and b/LAM_gpro/external/nvdiffrast/docs/img/teaser.png differ
diff --git a/LAM_gpro/external/nvdiffrast/docs/img/teaser1.png b/LAM_gpro/external/nvdiffrast/docs/img/teaser1.png
new file mode 100644
index 0000000..defdaf8
Binary files /dev/null and b/LAM_gpro/external/nvdiffrast/docs/img/teaser1.png differ
diff --git a/LAM_gpro/external/nvdiffrast/docs/img/teaser2.png b/LAM_gpro/external/nvdiffrast/docs/img/teaser2.png
new file mode 100644
index 0000000..a950a66
Binary files /dev/null and b/LAM_gpro/external/nvdiffrast/docs/img/teaser2.png differ
diff --git a/LAM_gpro/external/nvdiffrast/docs/img/teaser3.png b/LAM_gpro/external/nvdiffrast/docs/img/teaser3.png
new file mode 100644
index 0000000..1345016
Binary files /dev/null and b/LAM_gpro/external/nvdiffrast/docs/img/teaser3.png differ
diff --git a/LAM_gpro/external/nvdiffrast/docs/img/teaser4.png b/LAM_gpro/external/nvdiffrast/docs/img/teaser4.png
new file mode 100644
index 0000000..a0dceb8
Binary files /dev/null and b/LAM_gpro/external/nvdiffrast/docs/img/teaser4.png differ
diff --git a/LAM_gpro/external/nvdiffrast/docs/img/teaser5.png b/LAM_gpro/external/nvdiffrast/docs/img/teaser5.png
new file mode 100644
index 0000000..439de8a
Binary files /dev/null and b/LAM_gpro/external/nvdiffrast/docs/img/teaser5.png differ
diff --git a/LAM_gpro/external/nvdiffrast/docs/img/thumb.jpg b/LAM_gpro/external/nvdiffrast/docs/img/thumb.jpg
new file mode 100644
index 0000000..aab9d25
Binary files /dev/null and b/LAM_gpro/external/nvdiffrast/docs/img/thumb.jpg differ
diff --git a/LAM_gpro/external/nvdiffrast/docs/img/tri.png b/LAM_gpro/external/nvdiffrast/docs/img/tri.png
new file mode 100644
index 0000000..45b1735
Binary files /dev/null and b/LAM_gpro/external/nvdiffrast/docs/img/tri.png differ
diff --git a/LAM_gpro/external/nvdiffrast/docs/index.html b/LAM_gpro/external/nvdiffrast/docs/index.html
new file mode 100644
index 0000000..7c04f4f
--- /dev/null
+++ b/LAM_gpro/external/nvdiffrast/docs/index.html
@@ -0,0 +1,1060 @@
+
+
+
+
+ nvdiffrast
+
+
+
+
+
+
+
+
+
+
+
+
+Table of contents
+
+
+Overview
+Nvdiffrast is a PyTorch/TensorFlow library that provides high-performance primitive operations for rasterization-based differentiable rendering. It is a lower-level library compared to previous ones such as redner , SoftRas , or PyTorch3D — nvdiffrast has no built-in camera models, lighting/material models, etc. Instead, the provided operations encapsulate only the most graphics-centric steps in the modern hardware graphics pipeline: rasterization, interpolation, texturing, and antialiasing. All of these operations (and their gradients) are GPU-accelerated, either via CUDA or via the hardware graphics pipeline.
+This documentation is intended to serve as a user's guide to nvdiffrast. For detailed discussion on the design principles, implementation details, and benchmarks, please see our paper:
+
+Modular Primitives for High-Performance Differentiable Rendering Samuli Laine, Janne Hellsten, Tero Karras, Yeongho Seol, Jaakko Lehtinen, Timo Aila ACM Transactions on Graphics 39(6) (proc. SIGGRAPH Asia 2020)
+
+Paper: http://arxiv.org/abs/2011.03277 GitHub: https://github.com/NVlabs/nvdiffrast
+
+
+
+
+Examples of things we've done with nvdiffrast
+
+
+
+Installation
+Minimum requirements:
+
+Linux or Windows operating system.
+64-bit Python 3.6.
+PyTorch (recommended) 1.6 or TensorFlow 1.14. TensorFlow 2.x is currently not supported.
+A high-end NVIDIA GPU, NVIDIA drivers, CUDA 10.2 toolkit.
+
+To download nvdiffrast, either download the repository at https://github.com/NVlabs/nvdiffrast as a .zip file, or clone the repository using git:
+git clone https://github.com/NVlabs/nvdiffrast
+Linux
+We recommend running nvdiffrast on Docker . To build a Docker image with nvdiffrast and PyTorch 1.6 installed, run:
+./run_sample.sh --build-container
+We recommend using Ubuntu, as some Linux distributions might not have all the required packages available. Installation on CentOS is reportedly problematic, but success has been claimed here .
+To try out some of the provided code examples, run:
+./run_sample.sh ./samples/torch/cube.py --resolution 32
+Alternatively, if you have all the dependencies taken care of (consult the included Dockerfile for reference), you can install nvdiffrast in your local Python site-packages by running
+pip install .
+at the root of the repository. You can also just add the repository root directory to your PYTHONPATH.
+Windows
+On Windows, nvdiffrast requires an external compiler for compiling the CUDA kernels. The development was done using Microsoft Visual Studio 2017 Professional Edition, and this version works with both PyTorch and TensorFlow versions of nvdiffrast. VS 2019 Professional Edition has also been confirmed to work with the PyTorch version of nvdiffrast. Other VS editions besides Professional Edition, including the Community Edition, should work but have not been tested.
+If the compiler binary (cl.exe) cannot be found in PATH, nvdiffrast will search for it heuristically. If this fails you may need to add it manually via
+"C:\Program Files (x86)\Microsoft Visual Studio\...\...\VC\Auxiliary\Build\vcvars64.bat"
+where the exact path depends on the version and edition of VS you have installed.
+To install nvdiffrast in your local site-packages, run:
+# Ninja is required run-time to build PyTorch extensions
+pip install ninja
+
+# Run at the root of the repository to install nvdiffrast
+pip install .
+Instead of pip install . you can also just add the repository root directory to your PYTHONPATH.
+Primitive operations
+Nvdiffrast offers four differentiable rendering primitives: rasterization , interpolation , texturing , and antialiasing . The operation of the primitives is described here in a platform-agnostic way. Platform-specific documentation can be found in the API reference section.
+In this section we ignore the minibatch axis for clarity and assume a minibatch size of one. However, all operations support minibatches as detailed later.
+Rasterization
+The rasterization operation takes as inputs a tensor of vertex positions and a tensor of vertex index triplets that specify the triangles. Vertex positions are specified in clip space, i.e., after modelview and projection transformations. Performing these transformations is left as the user's responsibility. In clip space, the view frustum is a cube in homogeneous coordinates where x/w, y/w, z/w are all between -1 and +1.
+The output of the rasterization operation is a 4-channel float32 image with tuple (u, v, z/w, triangle_id) in each pixel. Values u and v are the barycentric coordinates within a triangle: the first vertex in the vertex index triplet obtains (u, v) = (1, 0), the second vertex (u, v) = (0, 1) and the third vertex (u, v) = (0, 0). Normalized depth value z/w is used later by the antialiasing operation to infer occlusion relations between triangles, and it does not propagate gradients to the vertex position input. Field triangle_id is the triangle index, offset by one. Pixels where no triangle was rasterized will receive a zero in all channels.
+Rasterization is point-sampled, i.e., the geometry is not smoothed, blurred, or made partially transparent in any way, in contrast to some previous differentiable rasterizers. The contents of a pixel always represent a single surface point that is on the closest surface visible along the ray through the pixel center.
+Point-sampled coverage does not produce vertex position gradients related to occlusion and visibility effects. This is because the motion of vertices does not change the coverage in a continuous way — a triangle is either rasterized into a pixel or not. In nvdiffrast, the occlusion/visibility related gradients are generated in the antialiasing operation that typically occurs towards the end of the rendering pipeline.
+
+
+
+
+
+[..., 0:2] = barycentrics (u, v)
+
+
+
+
+
+[..., 3] = triangle_id
+
+
+
+
+The images above illustrate the output of the rasterizer. The left image shows the contents of channels 0 and 1, i.e., the barycentric coordinates, rendered as red and green, respectively. The right image shows channel 3, i.e., the triangle ID, using a random color per triangle. Spot model was created and released into public domain by Keenan Crane .
+Interpolation
+Depending on the shading and lighting models, a mesh typically specifies a number of attributes at its vertices. These can include, e.g., texture coordinates, vertex normals, reflection vectors, and material parameters. The purpose of the interpolation operation is to transfer these attributes specified at vertices to image space. In the hardware graphics pipeline, this happens automatically between vertex and pixel shaders. The interpolation operation in nvdiffrast supports an arbitrary number of attributes.
+Concretely, the interpolation operation takes as inputs the buffer produced by the rasterizer and a buffer specifying the vertex attributes. The output is an image-size buffer with as many channels as there are attributes. Pixels where no triangle was rendered will contain all zeros in the output.
+
+
+
+
+
+Texture coordinates (s, t)
+
+
+
+
+Above is an example of interpolated texture coordinates visualized in red and green channels. This image was created using the output of the rasterizer from the previous step, and an attribute buffer containing the texture coordinates.
+Texturing
+Texture sampling is a fundamental operation in hardware graphics pipelines, and the same is true in nvdiffrast. The basic principle is simple: given a per-pixel texture coordinate vector, fetch a value from a texture and place it in the output. In nvdiffrast, the textures may have an arbitrary number of channels, which is useful in case you want to learn, say, an abstract field that acts as an input to a neural network further down the pipeline.
+When sampling a texture, it is typically desirable to use some form of filtering. Most previous differentiable rasterizers support at most bilinear filtering, where sampling at a texture coordinate between texel centers will interpolate the value linearly from the four nearest texels. While this works fine when viewing the texture up close, it yields badly aliased results when the texture is viewed from a distance. To avoid this, the texture needs to be prefiltered prior to sampling it, removing the frequencies that are too high compared to how densely it is being sampled.
+Nvdiffrast supports prefiltered texture sampling based on mipmapping . The required mipmap levels can be generated internally in the texturing operation, so that the user only needs to specify the highest-resolution (base level) texture. Currently the highest-quality filtering mode is isotropic trilinear filtering. The lack of anisotropic filtering means that a texture viewed at a steep angle will not alias in any direction, but it may appear blurry across the non-squished direction.
+In addition to standard 2D textures, the texture sampling operation also supports cube maps. Cube maps are addressed using 3D texture coordinates, and the transitions between cube map faces are properly filtered so there will be no visible seams. Cube maps support trilinear filtering similar to 2D textures. There is no explicit support for 1D textures but they can be simulated efficiently with 1×n textures. All the filtering, mipmapping etc. work with such textures just as they would with true 1D textures. For now there is no support for 3D volume textures.
+
+
+
+
+
+Texture of Spot
+
+
+
+
+
+Output of the texture sampling operation
+
+
+
+
+
+Background replaced with white
+
+
+
+
+The middle image above shows the result of texture sampling using the interpolated texture coordinates from the previous step. Why is the background pink? The texture coordinates (s, t) read as zero at those pixels, but that is a perfectly valid point to sample the texture. It happens that Spot's texture (left) has pink color at its (0, 0) corner, and therefore all pixels in the background obtain that color as a result of the texture sampling operation. On the right, we have replaced the color of the empty pixels with a white color. Here's one way to do this in PyTorch:
+ img_right = torch.where(rast_out[..., 3 :] > 0 , img_left, torch.tensor(1.0 ).cuda())
+where rast_out is the output of the rasterization operation. We simply test if the triangle_id field, i.e., channel 3 of the rasterizer output, is greater than zero, indicating that a triangle was rendered in that pixel. If so, we take the color from the textured image, and otherwise we take constant 1.0.
+Antialiasing
+The last of the four primitive operations in nvdiffrast is antialiasing. Based on the geometry input (vertex positions and triangles), it will smooth out discontinuities at silhouette edges in a given image. The smoothing is based on a local approximation of coverage — an approximate integral over a pixel is calculated based on the exact location of relevant edges and the point-sampled colors at pixel centers.
+In this context, a silhouette is any edge that connects to just one triangle, or connects two triangles so that one folds behind the other. Specifically, this includes both silhouettes against the background and silhouettes against another surface, unlike some previous methods (DIB-R ) that only support the former kind.
+It is worth discussing why we might want to go through this trouble to improve the image a tiny bit. If we're attempting to, say, match a real-world photograph, a slightly smoother edge probably won't match the captured image much better than a jagged one. However, that is not the point of the antialiasing operation — the real goal is to obtain gradients w.r.t. vertex positions related to occlusion, visibility, and coverage.
+Remember that everything up to this point in the rendering pipeline is point-sampled. In particular, the coverage, i.e., which triangle is rasterized to which pixel, changes discontinuously in the rasterization operation.
+This is the reason why previous differentiable rasterizers apply nonstandard image synthesis model with blur and transparency: Something has to make coverage continuous w.r.t. vertex positions if we wish to optimize vertex positions, camera position, etc., based on an image-space loss. In nvdiffrast, we do everything point-sampled so that we know that every pixel corresponds to a single, well-defined surface point. This lets us perform arbitrary shading computations without worrying about things like accidentally blurring texture coordinates across silhouettes, or having attributes mysteriously tend towards background color when getting close to the edge of the object. Only towards the end of the pipeline, the antialiasing operation ensures that the motion of vertex positions results in continuous change on silhouettes.
+The antialiasing operation supports any number of channels in the image to be antialiased. Thus, if your rendering pipeline produces an abstract representation that is fed to a neural network for further processing, that is not a problem.
+
+
+
+
+
+Antialiased image
+
+
+
+
+
+Closeup, before AA
+
+
+
+
+
+Closeup, after AA
+
+
+
+
+The left image above shows the result image from the last step, after performing antialiasing. The effect is quite small — some boundary pixels become less jagged, as shown in the closeups.
+Notably, not all boundary pixels are antialiased as revealed by the left-side image below. This is because the accuracy of the antialiasing operation in nvdiffrast depends on the rendered size of triangles: Because we store knowledge of just one surface point per pixel, antialiasing is possible only when the triangle that contains the actual geometric silhouette edge is visible in the image. The example image is rendered in very low resolution and the triangles are tiny compared to pixels. Thus, triangles get easily lost between the pixels.
+This results in incomplete-looking antialiasing, and the gradients provided by antialiasing become noisier when edge triangles are missed. Therefore it is advisable to render images in resolutions where the triangles are large enough to show up in the image at least most of the time.
+
+
+
+
+
+Pixels touched by antialiasing, original resolution
+
+
+
+
+
+Rendered in 4×4 higher resolution and downsampled
+
+
+
+
+The left image above shows which pixels were modified by the antialiasing operation in this example. On the right, we performed the rendering in 4×4 higher resolution and downsampled the final images back to the original size. This yields more accurate position gradients related to the silhouettes, so if you suspect your position gradients are too noisy, you may want to try simply increasing the resolution in which rasterization and antialiasing is done.
+For purposes of shape optimization, the sparse-looking situation on the left would probably be perfectly fine. The gradients are still going to point in the right direction even if they are somewhat sparse, and you will need to use some sort of shape regularization anyway, which will greatly increase tolerance to noisy shape gradients.
+Beyond the basics
+Rendering images is easy with nvdiffrast, but there are a few practical things that you will need to take into account. The topics in this section explain the operation and usage of nvdiffrast in more detail, and hopefully help you avoid any potential misunderstandings and pitfalls.
+Coordinate systems
+Nvdiffrast follows OpenGL's coordinate systems and other conventions. This is partially because we support OpenGL to accelerate the rasterization operation, but mostly so that there is a single standard to follow .
+
+
+In OpenGL convention, the perspective projection matrix (as implemented in, e.g., utils.projection() in our samples and glFrustum() in OpenGL) treats the view-space z as increasing towards the viewer. However, after multiplication by the perspective projection matrix, the homogeneous clip-space coordinate z/w increases away from the viewer. Hence, a larger depth value in the rasterizer output tensor also corresponds to a surface further away from the viewer.
+
+
+The memory order of image data in OpenGL, and consequently in nvdiffrast, is bottom-up. This means that row 0 of a tensor containing an image is the bottom row of the texture/image, which is the opposite of the more common scanline order. If you want to keep your image data in the conventional top-down order in your code, but have it logically the right way up inside nvdiffrast, you will need to flip the images vertically when crossing the boundary.
+
+
+For 2D textures, the coordinate origin (s, t) = (0, 0) is at the bottom left corner with s increasing to the right and t increasing to the top. When specifying the faces of a cube map texture, the orientation varies between the faces, but nvdiffrast follows the OpenGL convention here as well.
+
+
+As a word of advice, it is best to stay on top of coordinate systems and orientations used in your program. When something appears to be the wrong way around, it is much better to identify and fix the root cause than to randomly flip coordinates, images, buffers, and matrices until the immediate problem goes away.
+Geometry and minibatches: Range mode vs Instanced mode
+As mentioned earlier, all operations in nvdiffrast support the minibatch axis efficiently. Related to this, we support two ways for representing the geometry: range mode and instanced mode . If you want to render a different mesh in each minibatch index, you need to use the range mode. However, if you are rendering the same mesh, but with potentially different viewpoints, vertex positions, attributes, textures, etc., in each minibatch index, the instanced mode will be much more convenient.
+In range mode , you specify triangle index triplets as a 2D tensor of shape [num_triangles , 3], and vertex positions as a 2D tensor of shape [num_vertices , 4]. In addition to these, the rasterization operation requires an additional 2D range tensor of shape [minibatch_size , 2] where each row specifies a start index and count into the triangle tensor. As a result, the rasterizer will render the triangles in the specified ranges into each minibatch index of the output tensor. If you have multiple meshes, you should place all of them into the vertex and triangle tensors, and then choose which mesh to rasterize into each minibatch index via the contents of the range tensor. The attribute tensor in interpolation operation is handled in the same way as positions, and it has to be of shape [num_vertices , num_attributes ] in range mode.
+In instanced mode , the topology of the mesh will be shared for each minibatch index. The triangle tensor is still a 2D tensor with shape [num_triangles , 3], but the vertex positions are specified using a 3D tensor of shape [minibatch_size , num_vertices , 4]. With a 3D vertex position tensor, the rasterizer will not require the range tensor input, but will take the minibatch size from the first dimension of the vertex position tensor. The same triangles are rendered to each minibatch index, but with vertex positions taken from the corresponding slice of the vertex position tensor. In this mode, the attribute tensor in interpolation has to be a 3D tensor similar to position tensor, i.e., of shape [minibatch_size , num_vertices , num_attributes ]. However, you can provide an attribute tensor with minibatch size of 1, and it will be broadcast across the minibatch.
+Image-space derivatives
+We skirted around a pretty fundamental question in the description of the texturing operation above. In order to determine the proper amount of prefiltering for sampling a texture, we need to know how densely it is being sampled. But how can we know the sampling density when each pixel knows of just a single surface point?
+The solution is to track the image-space derivatives of all things leading up to the texture sampling operation. These are not the same thing as the gradients used in the backward pass, even though they both involve differentiation! Consider the barycentrics (u, v) produced by the rasterization operation. They change by some amount when moving horizontally or vertically in the image plane. If we denote the image-space coordinates as (X, Y), the image-space derivatives of the barycentrics would be ∂u/∂X, ∂u/∂Y, ∂v/∂X, and ∂v/∂Y. We can organize these into a 2×2 Jacobian matrix that describes the local relationship between (u, v) and (X, Y). This matrix is generally different at every pixel. For the purpose of image-space derivatives, the units of X and Y are pixels. Hence, ∂u/∂X is the local approximation of how much u changes when moving a distance of one pixel in the horizontal direction, and so on.
+Once we know how the barycentrics change w.r.t. pixel position, the interpolation operation can use this to determine how the attributes change w.r.t. pixel position. When attributes are used as texture coordinates, we can therefore tell how the texture sampling position (in texture space) changes when moving around within the pixel (up to a local, linear approximation, that is). This texture footprint tells us the scale on which the texture should be prefiltered. In more practical terms, it tells us which mipmap level(s) to use when sampling the texture.
+In nvdiffrast, the rasterization operation outputs the image-space derivatives of the barycentrics in an auxiliary 4-channel output tensor, ordered (∂u/∂X, ∂u/∂Y, ∂v/∂X, ∂v/∂Y) from channel 0 to 3. The interpolation operation can take this auxiliary tensor as input and compute image-space derivatives of any set of attributes being interpolated. Finally, the texture sampling operation can use the image-space derivatives of the texture coordinates to determine the amount of prefiltering.
+There is nothing magic about these image-space derivatives. They are tensors just like, e.g., the texture coordinates themselves, they propagate gradients backwards, and so on. For example, if you want to artificially blur or sharpen the texture when sampling it, you can simply multiply the tensor carrying the image-space derivatives of the texture coordinates ∂{s, t}/∂{X, Y} by a scalar value before feeding it into the texture sampling operation. This scales the texture footprints and thus adjusts the amount of prefiltering. If your loss function prefers a different level of sharpness, this multiplier will receive a nonzero gradient. Update: Since version 0.2.1, the texture sampling operation also supports a separate mip level bias input that would be better suited for this particular task, but the gist is the same nonetheless.
+One might wonder if it would have been easier to determine the texture footprints simply from the texture coordinates in adjacent pixels, and skip all this derivative rubbish? In easy cases the answer is yes, but silhouettes, occlusions, and discontinuous texture parameterizations would make this approach rather unreliable in practice. Computing the image-space derivatives analytically keeps everything point-like, local, and well-behaved.
+It should be noted that computing gradients related to image-space derivatives is somewhat involved and requires additional computation. At the same time, they are often not crucial for the convergence of the training/optimization. Because of this, the primitive operations in nvdiffrast offer options to disable the calculation of these gradients. We're talking about things like ∂Loss/∂(∂{u, v}/∂{X, Y}) that may look second-order-ish, but they're not.
+Mipmaps and texture dimensions
+Prefiltered texture sampling modes require mipmaps , i.e., downsampled versions, of the texture. The texture sampling operation can construct these internally, or you can provide your own mipmap stack, but there are limits to texture dimensions that need to be considered.
+When mipmaps are constructed internally, each mipmap level is constructed by averaging 2×2 pixel patches of the preceding level (or of the texture itself for the first mipmap level). The size of the buffer to be averaged therefore has to be divisible by 2 in both directions. There is one exception: side length of 1 is valid, and it will remain as 1 in the downsampling operation.
+For example, a 32×32 texture will produce the following mipmap stack:
+
+
+
+
+32×32
+
+
+→
+
+
+16×16
+
+
+→
+
+
+8×8
+
+
+→
+
+
+4×4
+
+
+→
+
+
+2×2
+
+
+→
+
+
+1×1
+
+
+
+
+Base texture
+
+
+Mip level 1
+
+
+Mip level 2
+
+
+Mip level 3
+
+
+Mip level 4
+
+
+Mip level 5
+
+
+
+
+And a 32×8 texture, with both sides powers of two but not equal, will result in:
+
+
+
+
+32×8
+
+
+→
+
+
+16×4
+
+
+→
+
+
+8×2
+
+
+→
+
+
+4×1
+
+
+→
+
+
+2×1
+
+
+→
+
+
+1×1
+
+
+
+
+Base texture
+
+
+Mip level 1
+
+
+Mip level 2
+
+
+Mip level 3
+
+
+Mip level 4
+
+
+Mip level 5
+
+
+
+
+For texture sizes like this, everything will work automatically and mipmaps are constructed down to 1×1 pixel size. Therefore, if you wish to use prefiltered texture sampling, you should scale your textures to power-of-two dimensions that do not, however, need to be equal.
+How about texture atlases? You may have an object whose texture is composed of multiple individual patches, or a collection of textured meshes with a unique texture for each. Say we have a texture atlas composed of five 32×32 sub-images, i.e., a total size of 160×32 pixels. Now we cannot compute mipmap levels all the way down to 1×1 size, because there is a 5×1 mipmap in the way that cannot be downsampled (because 5 is not even):
+
+
+
+
+160×32
+
+
+→
+
+
+80×16
+
+
+→
+
+
+40×8
+
+
+→
+
+
+20×4
+
+
+→
+
+
+10×2
+
+
+→
+
+
+5×1
+
+
+→
+
+
+Error!
+
+
+
+
+Base texture
+
+
+Mip level 1
+
+
+Mip level 2
+
+
+Mip level 3
+
+
+Mip level 4
+
+
+Mip level 5
+
+
+
+
+Scaling the atlas to, say, 256×32 pixels would feel silly because the dimensions of the sub-images are perfectly fine, and downsampling the different sub-images together — which would happen after the 5×1 resolution — would not make sense anyway. For this reason, the texture sampling operation allows the user to specify the maximum number of mipmap levels to be constructed and used. In this case, setting max_mip_level=5 would stop at the 5×1 mipmap and prevent the error.
+It is a deliberate design choice that nvdiffrast doesn't just stop automatically at a mipmap size it cannot downsample, but requires the user to specify a limit when the texture dimensions are not powers of two. The goal is to avoid bugs where prefiltered texture sampling mysteriously doesn't work due to an oddly sized texture. It would be confusing if a 256×256 texture gave beautifully prefiltered texture samples, a 255×255 texture suddenly had no prefiltering at all, and a 254×254 texture did just a bit of prefiltering (one level) but not more.
+If you compute your own mipmaps, their sizes must follow the scheme described above. There is no need to specify mipmaps all the way to 1×1 resolution, but the stack can end at any point and it will work equivalently to an internally constructed mipmap stack with a max_mip_level limit. Importantly, the gradients of user-provided mipmaps are not propagated automatically to the base texture — naturally so, because nvdiffrast knows nothing about the relation between them. Instead, the tensors that specify the mip levels in a user-provided mipmap stack will receive gradients of their own.
+Rasterizing with CUDA vs OpenGL
+Since version 0.3.0, nvdiffrast on PyTorch supports executing the rasterization operation using either CUDA or OpenGL. Earlier versions and the Tensorflow bindings support OpenGL only.
+When rasterization is executed on OpenGL, we use the GPU's graphics pipeline to determine which triangles land on which pixels. GPUs have amazingly efficient hardware for doing this — it is their original raison d'être — and thus it makes sense to exploit it. Unfortunately, some computing environments haven't been designed with this in mind, and it can be difficult to get OpenGL to work correctly and interoperate with CUDA cleanly. On Windows, compatibility is generally good because the GPU drivers required to run CUDA also include OpenGL support. Linux is more complicated, as various drivers can be installed separately and there isn't a standardized way to acquire access to the hardware graphics pipeline.
+Rasterizing in CUDA pretty much reverses these considerations. Compatibility is obviously not an issue on any CUDA-enabled platform. On the other hand, implementing the rasterization process correctly and efficiently on a massively data-parallel programming model is non-trivial. The CUDA rasterizer in nvdiffrast follows the approach described in research paper High-Performance Software Rasterization on GPUs by Laine and Karras, HPG 2011. Our code is based on the paper's publicly released CUDA kernels, with considerable modifications to support current hardware architectures and to match nvdiffrast's needs.
+The subpixel precision of the CUDA rasterizer is limited to 4 bits, and depth peeling is less accurate than with OpenGL. Memory consumption depends on many factors. Note: Restrictions related to output resolution have been removed in version 0.3.3. Although the internal resolution of the CUDA rasterizer remains capped at 2048×2048, nvdiffrast now invokes it automatically multiple times to support higher resolutions.
+It is difficult to predict which rasterizer offers better performance. For complex meshes and high resolutions OpenGL will most likely outperform the CUDA rasterizer, although it has certain overheads that the CUDA rasterizer does not have. For simple meshes and low resolutions the CUDA rasterizer may be faster, but it has its own overheads, too. Measuring the performance on actual data, on the target platform, and in the context of the entire program is the only way to know for sure.
+To run rasterization in CUDA, create a RasterizeCudaContext and supply it to the rasterize() operation. For OpenGL, use a RasterizeGLContext instead. Easy!
+Running on multiple GPUs
+Nvdiffrast supports computation on multiple GPUs in both PyTorch and TensorFlow. As is the convention in PyTorch, the operations are always executed on the device on which the input tensors reside. All GPU input tensors must reside on the same device, and the output tensors will unsurprisingly end up on that same device. In addition, the rasterization operation requires that its context was created for the correct device. In TensorFlow, the rasterizer context is automatically created on the device of the rasterization operation when it is executed for the first time.
+The remainder of this section applies only to OpenGL rasterizer contexts. CUDA rasterizer contexts require no special considerations besides making sure they're on the correct device.
+On Windows, nvdiffrast implements OpenGL device selection in a way that can be done only once per process — after one context is created, all future ones will end up on the same GPU. Hence you cannot expect to run the rasterization operation on multiple GPUs within the same process using an OpenGL context. Trying to do so will either cause a crash or incur a significant performance penalty. However, with PyTorch it is common to distribute computation across GPUs by launching a separate process for each GPU, so this is not a huge concern. Note that any OpenGL context created within the same process, even for something like a GUI window, will prevent changing the device later. Therefore, if you want to run the rasterization operation on other than the default GPU, be sure to create its OpenGL context before initializing any other OpenGL-powered libraries.
+On Linux everything just works, and you can create OpenGL rasterizer contexts on multiple devices within the same process.
+Note on torch.nn.DataParallel
+PyTorch offers torch.nn.DataParallel wrapper class for splitting the execution of a minibatch across multiple threads. Unfortunately, this class is fundamentally incompatible with OpenGL-dependent operations, as it spawns a new set of threads at each call (as of PyTorch 1.9.0, at least). Using previously created OpenGL contexts in these new threads, even if taking care to not use the same context in multiple threads, causes them to be migrated around and this has resulted in ever-growing GPU memory usage and abysmal GPU utilization. Therefore, we advise against using torch.nn.DataParallel for rasterization operations that depend on the OpenGL contexts.
+Notably, torch.nn.DistributedDataParallel spawns subprocesses that are much more persistent. The subprocesses must create their own OpenGL contexts as part of initialization, and as such they do not suffer from this problem.
+GitHub issue #23 , especially this comment , contains further analysis and suggestions for workarounds.
+Rendering multiple depth layers
+Sometimes there is a need to render scenes with partially transparent surfaces. In this case, it is not sufficient to find only the surfaces that are closest to the camera, as you may also need to know what lies behind them. For this purpose, nvdiffrast supports depth peeling that lets you extract multiple closest surfaces for each pixel.
+With depth peeling, we start by rasterizing the closest surfaces as usual. We then perform a second rasterization pass with the same geometry, but this time we cull all previously rendered surface points at each pixel, effectively extracting the second-closest depth layer. This can be repeated as many times as desired, so that we can extract as many depth layers as we like. See the images below for example results of depth peeling with each depth layer shaded and antialiased.
+
+
+
+
+
+First depth layer
+
+
+
+
+
+Second depth layer
+
+
+
+
+
+Third depth layer
+
+
+
+
+The API for depth peeling is based on DepthPeeler object that acts as a context manager , and its rasterize_next_layer method. The first call to rasterize_next_layer is equivalent to calling the traditional rasterize function, and subsequent calls report further depth layers. The arguments for rasterization are specified when instantiating the DepthPeeler object. Concretely, your code might look something like this:
+with nvdiffrast.torch.DepthPeeler(glctx, pos, tri, resolution) as peeler:
+ for i in range (num_layers):
+ rast, rast_db = peeler.rasterize_next_layer()
+ (process or store the results)
+There is no performance penalty compared to the basic rasterization op if you end up extracting only the first depth layer. In other words, the code above with num_layers=1 runs exactly as fast as calling rasterize once.
+Depth peeling is only supported in the PyTorch version of nvdiffrast. For implementation reasons, depth peeling reserves the rasterizer context so that other rasterization operations cannot be performed while the peeling is ongoing, i.e., inside the with block. Hence you cannot start a nested depth peeling operation or call rasterize inside the with block unless you use a different context.
+For the sake of completeness, let us note the following small caveat: Depth peeling relies on depth values to distinguish surface points from each other. Therefore, culling "previously rendered surface points" actually means culling all surface points at the same or closer depth as those rendered into the pixel in previous passes. This matters only if you have multiple layers of geometry at matching depths — if your geometry consists of, say, nothing but two exactly overlapping triangles, you will see one of them in the first pass but never see the other one in subsequent passes, as it's at the exact depth that is already considered done.
+Differences between PyTorch and TensorFlow
+Nvdiffrast can be used from PyTorch and from TensorFlow 1.x; the latter may change to TensorFlow 2.x if there is demand. These frameworks operate somewhat differently and that is reflected in the respective APIs. Simplifying a bit, in TensorFlow 1.x you construct a persistent graph out of persistent nodes, and run many batches of data through it. In PyTorch, there is no persistent graph or nodes, but a new, ephemeral graph is constructed for each batch of data and destroyed immediately afterwards. Therefore, there is also no persistent state for the operations. There is the torch.nn.Module abstraction for festooning operations with persistent state, but we do not use it.
+As a consequence, things that would be part of persistent state of an nvdiffrast operation in TensorFlow must be stored by the user in PyTorch, and supplied to the operations as needed. In practice, this is a very small difference and amounts to just a couple of lines of code in most cases.
+As an example, consider the OpenGL context used by the rasterization operation. In order to use hardware-accelerated rendering, an OpenGL context must be created and switched into before issuing OpenGL commands internally. Creating the context is an expensive operation, so we don't want to create and destroy one at every call of the rasterization operation. In TensorFlow, the rasterization operation creates a context when it is executed for the first time, and stashes it away in its persistent state to be reused later. In PyTorch, the user has to create the context using a separate function call, and supply it as a parameter to the rasterization operation.
+Similarly, if you have a constant texture and want to use prefiltered texture sampling modes, the mipmap stack only needs to be computed once. In TensorFlow, you can specify that the texture is constant, in which case the texture sampling operation only computes the mipmap stack on the first execution and stores it internally. In PyTorch, you can compute the mipmap stack once using a separate function call, and supply it to the texture sampling operation every time. If you don't do that, the operation will compute the mipmap stack internally and discard it afterwards. This is exactly what you want if your texture changes at every iteration, and it's not wrong even if the texture is constant, just a bit inefficient.
+Finally, the same holds for a thing called the topology hash that the antialiasing operation uses for identifying potential silhouette edges. Its contents depend only on the triangle tensor, not the vertex positions, so if the topology is constant, this auxiliary structure needs to be constructed only once. As before, in TensorFlow this is handled internally, whereas in PyTorch a separate function is provided for off-line construction.
+Manual OpenGL contexts in PyTorch
+First, please note that handling OpenGL contexts manually is a very small optimization. It almost certainly won't be relevant unless you've already profiled and optimized your code with gusto , and you're on a mission to extract every last bit of performance possible.
+In TensorFlow, the only option is to let nvdiffrast handle the OpenGL context management internally. This is because TensorFlow utilizes multiple CPU threads under the hood, and the active OpenGL context is a thread-local resource.
+PyTorch isn't as unpredictable, and stays in the same CPU thread by default (although things like torch.utils.data.DataLoader do invoke additional CPU threads). As such, nvdiffrast lets the user choose between handling OpenGL context switching in automatic or manual mode. The default is automatic mode where the rasterization operation always sets/releases the context at the beginning/end of each execution, like we do in TensorFlow. This ensures that the rasterizer will always use the context that you supply, and the context won't remain active so nobody else can mess with it.
+In manual mode, the user assumes the responsibility of setting and releasing the OpenGL context. Most of the time, if you don't have any other libraries that would be using OpenGL, you can just set the context once after having created it and keep it set until the program exits. However, keep in mind that the active OpenGL context is a thread-local resource, so it needs to be set in the same CPU thread as it will be used, and it cannot be set simultaneously in multiple CPU threads.
+Samples
+Nvdiffrast comes with a set of samples that were crafted to support the research paper. Each sample is available in both PyTorch and TensorFlow versions. Details such as command-line parameters, logging format, etc., may not be identical between the versions, and generally the PyTorch versions should be considered definitive. The command-line examples below are for the PyTorch versions.
+All PyTorch samples support selecting between CUDA and OpenGL rasterizer contexts. The default is to do rasterization in CUDA, and switching to OpenGL is done by specifying command-line option --opengl.
+Enabling interactive display using the --display-interval parameter is likely to fail on Linux when using OpenGL rasterization. This is because the interactive display window is shown using OpenGL, and on Linux this conflicts with the internal OpenGL rasterization in nvdiffrast. Using a CUDA context should work, assuming that OpenGL is correctly installed in the system (for displaying the window). Our Dockerfile is set up to support headless rendering only, and thus cannot show an interactive result window.
+
+This is a minimal sample that renders a triangle and saves the resulting image into a file (tri.png) in the current directory. Running this should be the first step to verify that you have everything set up correctly. Rendering is done using the rasterization and interpolation operations, so getting the correct output image means that both OpenGL (if specified on command line) and CUDA are working as intended under the hood.
+This is the only sample where you must specify either --cuda or --opengl on command line. Other samples default to CUDA rasterization and provide only the --opengl option.
+Example command lines:
+python triangle.py --cuda
+python triangle.py --opengl
+
+
+
+
+
+The expected output image
+
+
+
+
+
+In this sample, we optimize the vertex positions and colors of a cube mesh, starting from a semi-randomly initialized state. The optimization is based on image-space loss in extremely low resolutions such as 4×4, 8×8, or 16×16 pixels. The goal of this sample is to examine the rate of geometrical convergence when the triangles are only a few pixels in size. It serves to illustrate that the antialiasing operation, despite being approximative, yields good enough position gradients even in 4×4 resolution to guide the optimization to the goal.
+Example command line:
+python cube.py --resolution 16 --display-interval 10
+
+
+
+
+
+Interactive view of cube.py
+
+
+
+
+
+Rendering pipeline
+
+
+
+
+The image above shows a live view of the sample. Top row shows the low-resolution rendered image and reference image that the image-space loss is calculated from. Bottom row shows the current mesh (and colors) and reference mesh in high resolution so that convergence can be seen more easily visually.
+In the pipeline diagram, green boxes indicate nvdiffrast operations, whereas blue boxes are other computation. Red boxes are the learned tensors and gray are non-learned tensors or other data.
+
+The goal of this sample is to compare texture convergence with and without prefiltered texture sampling. The texture is learned based on image-space loss against high-quality reference renderings in random orientations and at random distances. When prefiltering is disabled, the texture is not learned properly because of spotty gradient updates caused by aliasing. This shows as a much worse PSNR for the texture, compared to learning with prefiltering enabled. See the paper for further discussion.
+Example command lines:
+
+
+
+python earth.py --display-interval 10
+
+
+No prefiltering, bilinear interpolation.
+
+
+
+
+python earth.py --display-interval 10 --mip
+
+
+Prefiltering enabled, trilinear interpolation.
+
+
+
+
+
+
+
+
+Interactive view of earth.py, prefiltering disabled
+
+
+
+
+
+Rendering pipeline
+
+
+
+
+The interactive view shows the current texture mapped onto the mesh, with or without prefiltered texture sampling as specified via the command-line parameter. In this sample, no antialiasing is performed because we are not learning vertex positions and hence need no gradients related to them.
+
+In this sample, a more complex shading model is used compared to the vertex colors or plain texture in the previous ones. Here, we learn a reflected environment map and parameters of a Phong BRDF model given a known mesh. The optimization is based on image-space loss against reference renderings in random orientations. The shading model of mirror reflection plus a Phong BRDF is not physically sensible, but it works as a reasonably simple strawman that would not be possible to implement with previous differentiable rasterizers that bundle rasterization, shading, lighting, and texturing together. The sample also illustrates the use of cube mapping for representing a learned texture in a spherical domain.
+Example command line:
+python envphong.py --display-interval 10
+
+
+
+
+
+Interactive view of envphong.py
+
+
+
+
+
+Rendering pipeline
+
+
+
+
+In the interactive view, we see the rendering with the current environment map and Phong BRDF parameters, both gradually improving during the optimization.
+
+Pose fitting based on an image-space loss is a classical task in differentiable rendering. In this sample, we solve a pose optimization problem with a simple cube with differently colored sides. We detail the optimization method in the paper, but in brief, it combines gradient-free greedy optimization in an initialization phase and gradient-based optimization in a fine-tuning phase.
+Example command line:
+python pose.py --display-interval 10
+
+
+
+
+
+Interactive view of pose.py
+
+
+
+
+The interactive view shows, from left to right: target pose, best found pose, and current pose. When viewed live, the two stages of optimization are clearly visible. In the first phase, the best pose updates intermittently when a better initialization is found. In the second phase, the solution converges smoothly to the target via gradient-based optimization.
+PyTorch API reference
+
+
nvdiffrast.torch.RasterizeCudaContext(device =None ) Class
+
Create a new Cuda rasterizer context.
The context is deleted and internal storage is released when the object is
+destroyed.
Arguments:
device Cuda device on which the context is created. Type can be
+torch.device, string (e.g., 'cuda:1'), or int. If not
+specified, context will be created on currently active Cuda
+device.
Returns:
The newly created Cuda rasterizer context.
+
nvdiffrast.torch.RasterizeGLContext(output_db =True , mode ='automatic' , device =None ) Class
+
Create a new OpenGL rasterizer context.
Creating an OpenGL context is a slow operation so you should usually reuse the same
+context in all calls to rasterize() on the same CPU thread. The OpenGL context
+is deleted when the object is destroyed.
Side note: When using the OpenGL context in a rasterization operation, the
+context's internal framebuffer object is automatically enlarged to accommodate the
+rasterization operation's output shape, but it is never shrunk in size until the
+context is destroyed. Thus, if you need to rasterize, say, deep low-resolution
+tensors and also shallow high-resolution tensors, you can conserve GPU memory by
+creating two separate OpenGL contexts for these tasks. In this scenario, using the
+same OpenGL context for both tasks would end up reserving GPU memory for a deep,
+high-resolution output tensor.
Arguments:
output_db Compute and output image-space derivatives of barycentrics. mode OpenGL context handling mode. Valid values are 'manual' and 'automatic'. device Cuda device on which the context is created. Type can be
+torch.device, string (e.g., 'cuda:1'), or int. If not
+specified, context will be created on currently active Cuda
+device.
Methods, only available if context was created in manual mode:
set_context() Set (activate) OpenGL context in the current CPU thread. release_context() Release (deactivate) currently active OpenGL context.
Returns:
The newly created OpenGL rasterizer context.
+
nvdiffrast.torch.rasterize(glctx , pos , tri , resolution , ranges =None , grad_db =True ) Function
+
Rasterize triangles.
All input tensors must be contiguous and reside in GPU memory except for
+the ranges tensor that, if specified, has to reside in CPU memory. The
+output tensors will be contiguous and reside in GPU memory.
Arguments:
glctx Rasterizer context of type RasterizeGLContext or RasterizeCudaContext. pos Vertex position tensor with dtype torch.float32. To enable range
+mode, this tensor should have a 2D shape [num_vertices, 4]. To enable
+instanced mode, use a 3D shape [minibatch_size, num_vertices, 4]. tri Triangle tensor with shape [num_triangles, 3] and dtype torch.int32. resolution Output resolution as integer tuple (height, width). ranges In range mode, tensor with shape [minibatch_size, 2] and dtype
+torch.int32, specifying start indices and counts into tri.
+Ignored in instanced mode. grad_db Propagate gradients of image-space derivatives of barycentrics
+into pos in backward pass. Ignored if using an OpenGL context that
+was not configured to output image-space derivatives.
Returns:
A tuple of two tensors. The first output tensor has shape [minibatch_size,
+height, width, 4] and contains the main rasterizer output in order (u, v, z/w,
+triangle_id). If the OpenGL context was configured to output image-space
+derivatives of barycentrics, the second output tensor will also have shape
+[minibatch_size, height, width, 4] and contain said derivatives in order
+(du/dX, du/dY, dv/dX, dv/dY). Otherwise it will be an empty tensor with shape
+[minibatch_size, height, width, 0].
+
nvdiffrast.torch.DepthPeeler(... ) Class
+
Create a depth peeler object for rasterizing multiple depth layers.
Arguments are the same as in rasterize().
Returns:
The newly created depth peeler.
+
nvdiffrast.torch.DepthPeeler.rasterize_next_layer() Method
+
Rasterize next depth layer.
Operation is equivalent to rasterize() except that previously reported
+surface points are culled away.
Returns:
A tuple of two tensors as in rasterize().
+
nvdiffrast.torch.interpolate(attr , rast , tri , rast_db =None , diff_attrs =None ) Function
+
Interpolate vertex attributes.
All input tensors must be contiguous and reside in GPU memory. The output tensors
+will be contiguous and reside in GPU memory.
Arguments:
attr Attribute tensor with dtype torch.float32.
+Shape is [num_vertices, num_attributes] in range mode, or
+[minibatch_size, num_vertices, num_attributes] in instanced mode.
+Broadcasting is supported along the minibatch axis. rast Main output tensor from rasterize(). tri Triangle tensor with shape [num_triangles, 3] and dtype torch.int32. rast_db (Optional) Tensor containing image-space derivatives of barycentrics,
+i.e., the second output tensor from rasterize(). Enables computing
+image-space derivatives of attributes. diff_attrs (Optional) List of attribute indices for which image-space
+derivatives are to be computed. Special value 'all' is equivalent
+to list [0, 1, ..., num_attributes - 1].
Returns:
A tuple of two tensors. The first output tensor contains interpolated
+attributes and has shape [minibatch_size, height, width, num_attributes].
+If rast_db and diff_attrs were specified, the second output tensor contains
+the image-space derivatives of the selected attributes and has shape
+[minibatch_size, height, width, 2 * len(diff_attrs)]. The derivatives of the
+first selected attribute A will be on channels 0 and 1 as (dA/dX, dA/dY), etc.
+Otherwise, the second output tensor will be an empty tensor with shape
+[minibatch_size, height, width, 0].
+
nvdiffrast.torch.texture(tex , uv , uv_da =None , mip_level_bias =None , mip =None , filter_mode ='auto' , boundary_mode ='wrap' , max_mip_level =None ) Function
+
Perform texture sampling.
All input tensors must be contiguous and reside in GPU memory. The output tensor
+will be contiguous and reside in GPU memory.
Arguments:
tex Texture tensor with dtype torch.float32. For 2D textures, must have shape
+[minibatch_size, tex_height, tex_width, tex_channels]. For cube map textures,
+must have shape [minibatch_size, 6, tex_height, tex_width, tex_channels] where
+tex_width and tex_height are equal. Note that boundary_mode must also be set
+to 'cube' to enable cube map mode. Broadcasting is supported along the minibatch axis. uv Tensor containing per-pixel texture coordinates. When sampling a 2D texture,
+must have shape [minibatch_size, height, width, 2]. When sampling a cube map
+texture, must have shape [minibatch_size, height, width, 3]. uv_da (Optional) Tensor containing image-space derivatives of texture coordinates.
+Must have same shape as uv except for the last dimension that is to be twice
+as long. mip_level_bias (Optional) Per-pixel bias for mip level selection. If uv_da is omitted,
+determines mip level directly. Must have shape [minibatch_size, height, width]. mip (Optional) Preconstructed mipmap stack from a texture_construct_mip() call, or a list
+of tensors specifying a custom mipmap stack. When specifying a custom mipmap stack,
+the tensors in the list must follow the same format as tex except for width and
+height that must follow the usual rules for mipmap sizes. The base level texture
+is still supplied in tex and must not be included in the list. Gradients of a
+custom mipmap stack are not automatically propagated to base texture but the mipmap
+tensors will receive gradients of their own. If a mipmap stack is not specified
+but the chosen filter mode requires it, the mipmap stack is constructed internally
+and discarded afterwards. filter_mode Texture filtering mode to be used. Valid values are 'auto', 'nearest',
+'linear', 'linear-mipmap-nearest', and 'linear-mipmap-linear'. Mode 'auto'
+selects 'linear' if neither uv_da nor mip_level_bias is specified, and
+'linear-mipmap-linear' when at least one of them is specified, these being
+the highest-quality modes possible depending on the availability of the
+image-space derivatives of the texture coordinates or direct mip level information. boundary_mode Valid values are 'wrap', 'clamp', 'zero', and 'cube'. If tex defines a
+cube map, this must be set to 'cube'. The default mode 'wrap' takes fractional
+part of texture coordinates. Mode 'clamp' clamps texture coordinates to the
+centers of the boundary texels. Mode 'zero' virtually extends the texture with
+all-zero values in all directions. max_mip_level If specified, limits the number of mipmaps constructed and used in mipmap-based
+filter modes.
Returns:
A tensor containing the results of the texture sampling with shape
+[minibatch_size, height, width, tex_channels]. Cube map fetches with invalid uv coordinates
+(e.g., zero vectors) output all zeros and do not propagate gradients.
+
nvdiffrast.torch.texture_construct_mip(tex , max_mip_level =None , cube_mode =False ) Function
+
Construct a mipmap stack for a texture.
This function can be used for constructing a mipmap stack for a texture that is known to remain
+constant. This avoids reconstructing it every time texture() is called.
Arguments:
tex Texture tensor with the same constraints as in texture(). max_mip_level If specified, limits the number of mipmaps constructed. cube_mode Must be set to True if tex specifies a cube map texture.
Returns:
An opaque object containing the mipmap stack. This can be supplied in a call to texture()
+in the mip argument.
+
nvdiffrast.torch.antialias(color , rast , pos , tri , topology_hash =None , pos_gradient_boost =1.0 ) Function
+
Perform antialiasing.
All input tensors must be contiguous and reside in GPU memory. The output tensor
+will be contiguous and reside in GPU memory.
Note that silhouette edge determination is based on vertex indices in the triangle
+tensor. For it to work properly, a vertex belonging to multiple triangles must be
+referred to using the same vertex index in each triangle. Otherwise, nvdiffrast will always
+classify the adjacent edges as silhouette edges, which leads to bad performance and
+potentially incorrect gradients. If you are unsure whether your data is good, check
+which pixels are modified by the antialias operation and compare to the example in the
+documentation.
Arguments:
color Input image to antialias with shape [minibatch_size, height, width, num_channels]. rast Main output tensor from rasterize(). pos Vertex position tensor used in the rasterization operation. tri Triangle tensor used in the rasterization operation. topology_hash (Optional) Preconstructed topology hash for the triangle tensor. If not
+specified, the topology hash is constructed internally and discarded afterwards. pos_gradient_boost (Optional) Multiplier for gradients propagated to pos.
Returns:
A tensor containing the antialiased image with the same shape as color input tensor.
+
nvdiffrast.torch.antialias_construct_topology_hash(tri ) Function
+
Construct a topology hash for a triangle tensor.
This function can be used for constructing a topology hash for a triangle tensor that is
+known to remain constant. This avoids reconstructing it every time antialias() is called.
Arguments:
tri Triangle tensor with shape [num_triangles, 3]. Must be contiguous and reside in
+GPU memory.
Returns:
An opaque object containing the topology hash. This can be supplied in a call to
+antialias() in the topology_hash argument.
+
nvdiffrast.torch.get_log_level( ) Function
+
Get current log level.
Returns:
Current log level in nvdiffrast. See set_log_level() for possible values.
+
nvdiffrast.torch.set_log_level(level ) Function
+
Set log level.
Log levels follow the convention on the C++ side of Torch:
+ 0 = Info,
+ 1 = Warning,
+ 2 = Error,
+ 3 = Fatal.
+The default log level is 1.
Arguments:
level New log level as integer. Internal nvdiffrast messages of this
+severity or higher will be printed, while messages of lower
+severity will be silent.
+
+
+Licenses
+Copyright © 2020–2024, NVIDIA Corporation. All rights reserved.
+This work is made available under the Nvidia Source Code License .
+For business inquiries, please visit our website and submit the form: NVIDIA Research Licensing
+We do not currently accept outside contributions in the form of pull requests.
+Environment map stored as part of samples/data/envphong.npz is derived from a Wave Engine sample material originally shared under MIT License . Mesh and texture stored as part of samples/data/earth.npz are derived from 3D Earth Photorealistic 2K model originally made available under TurboSquid 3D Model License .
+Citation
+@article{Laine2020diffrast,
+ title = {Modular Primitives for High-Performance Differentiable Rendering},
+ author = {Samuli Laine and Janne Hellsten and Tero Karras and Yeongho Seol and Jaakko Lehtinen and Timo Aila},
+ journal = {ACM Transactions on Graphics},
+ year = {2020},
+ volume = {39},
+ number = {6}
+}
+Acknowledgements
+We thank David Luebke, Simon Yuen, Jaewoo Seo, Tero Kuosmanen, Sanja Fidler, Wenzheng Chen, Jacob Munkberg, Jon Hasselgren, and Onni Kosomaa for discussions, test data, support with compute infrastructure, testing, reviewing, and suggestions for features and improvements.
+
+
+
+
+
+
diff --git a/LAM_gpro/external/nvdiffrast/nvdiffrast/__init__.py b/LAM_gpro/external/nvdiffrast/nvdiffrast/__init__.py
new file mode 100644
index 0000000..fd28a08
--- /dev/null
+++ b/LAM_gpro/external/nvdiffrast/nvdiffrast/__init__.py
@@ -0,0 +1,9 @@
+# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
+#
+# NVIDIA CORPORATION and its licensors retain all intellectual property
+# and proprietary rights in and to this software, related documentation
+# and any modifications thereto. Any use, reproduction, disclosure or
+# distribution of this software and related documentation without an express
+# license agreement from NVIDIA CORPORATION is strictly prohibited.
+
+__version__ = '0.3.3'
diff --git a/LAM_gpro/external/nvdiffrast/nvdiffrast/common/antialias.cu b/LAM_gpro/external/nvdiffrast/nvdiffrast/common/antialias.cu
new file mode 100644
index 0000000..95cc3ba
--- /dev/null
+++ b/LAM_gpro/external/nvdiffrast/nvdiffrast/common/antialias.cu
@@ -0,0 +1,558 @@
+// Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
+//
+// NVIDIA CORPORATION and its licensors retain all intellectual property
+// and proprietary rights in and to this software, related documentation
+// and any modifications thereto. Any use, reproduction, disclosure or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA CORPORATION is strictly prohibited.
+
+#include "antialias.h"
+
+//------------------------------------------------------------------------
+// Helpers.
+
+#define F32_MAX (3.402823466e+38f)
+static __forceinline__ __device__ bool same_sign(float a, float b) { return (__float_as_int(a) ^ __float_as_int(b)) >= 0; } // True iff the IEEE sign bits of a and b agree (bitwise test; note -0.f counts as negative).
+static __forceinline__ __device__ bool rational_gt(float n0, float n1, float d0, float d1) { return (n0*d1 > n1*d0) == same_sign(d0, d1); } // n0/d0 > n1/d1 via cross-multiplication; the sign check undoes the inequality flip when d0 and d1 differ in sign.
+static __forceinline__ __device__ int max_idx3(float n0, float n1, float n2, float d0, float d1, float d2) // Index (0, 1, or 2) of the greatest of the rationals n0/d0, n1/d1, n2/d2.
+{
+ bool g10 = rational_gt(n1, n0, d1, d0);
+ bool g20 = rational_gt(n2, n0, d2, d0);
+ bool g21 = rational_gt(n2, n1, d2, d1);
+ if (g20 && g21) return 2; // Entry 2 beats both others.
+ if (g10) return 1; // Otherwise the 1-vs-0 comparison decides.
+ return 0;
+}
+
+//------------------------------------------------------------------------
+// Format of antialiasing work items stored in work buffer. Usually accessed directly as int4.
+
+struct AAWorkItem
+{
+ enum
+ {
+ EDGE_MASK = 3, // Edge index (0..2) stored in the two lowest flag bits.
+ FLAG_DOWN_BIT = 2, // Set when the neighbor is the pixel below instead of to the right.
+ FLAG_TRI1_BIT = 3, // Set when the chosen edge belongs to the neighbor pixel's triangle.
+ };
+
+ int px, py; // Pixel x, y.
+ unsigned int pz_flags; // High 16 bits = pixel z (minibatch index), low 16 bits = edge index and flags.
+ float alpha; // Antialiasing alpha value. Zero if no AA.
+};
+
+//------------------------------------------------------------------------
+// Hash functions. Adapted from public-domain code at http://www.burtleburtle.net/bob/hash/doobs.html
+
+#define JENKINS_MAGIC (0x9e3779b9u)
+static __device__ __forceinline__ void jenkins_mix(unsigned int& a, unsigned int& b, unsigned int& c) // Jenkins 96-bit mix: scrambles a, b, c in place for hashing.
+{
+ a -= b; a -= c; a ^= (c>>13);
+ b -= c; b -= a; b ^= (a<<8);
+ c -= a; c -= b; c ^= (b>>13);
+ a -= b; a -= c; a ^= (c>>12);
+ b -= c; b -= a; b ^= (a<<16);
+ c -= a; c -= b; c ^= (b>>5);
+ a -= b; a -= c; a ^= (c>>3);
+ b -= c; b -= a; b ^= (a<<10);
+ c -= a; c -= b; c ^= (b>>15);
+}
+
+// Helper class for hash index iteration. Implements simple odd-skip linear probing with a key-dependent skip.
+class HashIndex
+{
+public:
+ __device__ __forceinline__ HashIndex(const AntialiasKernelParams& p, uint64_t key) // Derives start slot and probe skip from the 64-bit key.
+ {
+ m_mask = (p.allocTriangles << AA_LOG_HASH_ELEMENTS_PER_TRIANGLE(p.allocTriangles)) - 1; // This should work until triangle count exceeds 1073741824.
+ m_idx = (uint32_t)(key & 0xffffffffu);
+ m_skip = (uint32_t)(key >> 32);
+ uint32_t dummy = JENKINS_MAGIC;
+ jenkins_mix(m_idx, m_skip, dummy);
+ m_idx &= m_mask;
+ m_skip &= m_mask;
+ m_skip |= 1; // Odd skip visits every slot of the power-of-two table before repeating.
+ }
+ __device__ __forceinline__ int get(void) const { return m_idx; } // Current slot index.
+ __device__ __forceinline__ void next(void) { m_idx = (m_idx + m_skip) & m_mask; } // Advance to the next probe slot.
+private:
+ uint32_t m_idx, m_skip, m_mask;
+};
+
+static __device__ __forceinline__ void hash_insert(const AntialiasKernelParams& p, uint64_t key, int v) // Insert value v under key; each slot stores the key plus up to two distinct values (entry.z, entry.w).
+{
+ HashIndex idx(p, key);
+ while(1)
+ {
+ uint64_t prev = atomicCAS((unsigned long long*)&p.evHash[idx.get()], 0, (unsigned long long)key); // Claim an empty slot or land on our own key.
+ if (prev == 0 || prev == key)
+ break;
+ idx.next(); // Slot owned by another key: keep probing.
+ }
+ int* q = (int*)&p.evHash[idx.get()];
+ int a = atomicCAS(q+2, 0, v); // Try the first value slot (entry.z).
+ if (a != 0 && a != v)
+ atomicCAS(q+3, 0, v); // z held a different value: try the second slot (entry.w).
+}
+
+static __device__ __forceinline__ int2 hash_find(const AntialiasKernelParams& p, uint64_t key) // Look up key; returns the slot's two values (zeros when the probe hits an empty slot, i.e. key absent).
+{
+ HashIndex idx(p, key);
+ while(1)
+ {
+ uint4 entry = p.evHash[idx.get()];
+ uint64_t k = ((uint64_t)entry.x) | (((uint64_t)entry.y) << 32); // Reassemble the stored 64-bit key.
+ if (k == key || k == 0)
+ return make_int2((int)entry.z, (int)entry.w);
+ idx.next();
+ }
+}
+
+static __device__ __forceinline__ void evhash_insert_vertex(const AntialiasKernelParams& p, int va, int vb, int vn) // Record that vertex vn lies opposite edge (va, vb).
+{
+ if (va == vb)
+ return;
+
+ uint64_t v0 = (uint32_t)min(va, vb) + 1; // canonical vertex order; +1 reserves 0 for "empty".
+ uint64_t v1 = (uint32_t)max(va, vb) + 1;
+ uint64_t vk = v0 | (v1 << 32); // hash key
+ hash_insert(p, vk, vn + 1); // Value also biased by +1 so 0 can mean "no vertex".
+}
+
+static __forceinline__ __device__ int evhash_find_vertex(const AntialiasKernelParams& p, int va, int vb, int vr) // Vertex opposite edge (va, vb) on the far side from vr, or -1 if none (boundary/silhouette edge).
+{
+ if (va == vb)
+ return -1;
+
+ uint64_t v0 = (uint32_t)min(va, vb) + 1; // canonical vertex order
+ uint64_t v1 = (uint32_t)max(va, vb) + 1;
+ uint64_t vk = v0 | (v1 << 32); // hash key
+ int2 vn = hash_find(p, vk) - 1; // Undo the +1 bias; empty slots become -1.
+ if (vn.x == vr) return vn.y;
+ if (vn.y == vr) return vn.x;
+ return -1;
+}
+
+//------------------------------------------------------------------------
+// Mesh analysis kernel.
+
+__global__ void AntialiasFwdMeshKernel(const AntialiasKernelParams p) // One thread per triangle: builds the edge-vertex hash used to detect silhouette edges.
+{
+ int idx = threadIdx.x + blockIdx.x * blockDim.x;
+ if (idx >= p.numTriangles)
+ return;
+
+ int v0 = p.tri[idx * 3 + 0];
+ int v1 = p.tri[idx * 3 + 1];
+ int v2 = p.tri[idx * 3 + 2];
+
+ if (v0 < 0 || v0 >= p.numVertices ||
+ v1 < 0 || v1 >= p.numVertices ||
+ v2 < 0 || v2 >= p.numVertices)
+ return; // Skip triangles with out-of-range vertex indices.
+
+ if (v0 == v1 || v1 == v2 || v2 == v0)
+ return; // Skip degenerate triangles.
+
+ evhash_insert_vertex(p, v1, v2, v0); // Register each edge with its opposing vertex.
+ evhash_insert_vertex(p, v2, v0, v1);
+ evhash_insert_vertex(p, v0, v1, v2);
+}
+
+//------------------------------------------------------------------------
+// Discontinuity finder kernel.
+
+__global__ void AntialiasFwdDiscontinuityKernel(const AntialiasKernelParams p) // One thread per pixel: emits a work item for each right/down neighbor with a different triangle id.
+{
+ // Calculate pixel position.
+ int px = blockIdx.x * AA_DISCONTINUITY_KERNEL_BLOCK_WIDTH + threadIdx.x;
+ int py = blockIdx.y * AA_DISCONTINUITY_KERNEL_BLOCK_HEIGHT + threadIdx.y;
+ int pz = blockIdx.z;
+ if (px >= p.width || py >= p.height || pz >= p.n)
+ return;
+
+ // Pointer to our TriIdx and fetch.
+ int pidx0 = ((px + p.width * (py + p.height * pz)) << 2) + 3; // Raster output has 4 channels per pixel; component 3 holds the triangle id.
+ float tri0 = p.rasterOut[pidx0]; // These can stay as float, as we only compare them against each other.
+
+ // Look right, clamp at edge.
+ int pidx1 = pidx0;
+ if (px < p.width - 1)
+ pidx1 += 4;
+ float tri1 = p.rasterOut[pidx1];
+
+ // Look down, clamp at edge.
+ int pidx2 = pidx0;
+ if (py < p.height - 1)
+ pidx2 += p.width << 2;
+ float tri2 = p.rasterOut[pidx2];
+
+ // Determine amount of work.
+ int count = 0;
+ if (tri1 != tri0) count = 1;
+ if (tri2 != tri0) count += 1;
+ if (!count)
+ return; // Exit warp.
+
+ // Coalesce work counter update to once per CTA.
+ __shared__ int s_temp;
+ s_temp = 0;
+ __syncthreads();
+ int idx = atomicAdd(&s_temp, count); // Per-CTA offset for this thread's items.
+ __syncthreads();
+ if (idx == 0)
+ {
+ int base = atomicAdd(&p.workBuffer[0].x, s_temp); // One global reservation for the whole CTA.
+ s_temp = base + 1; // don't clobber the counters in first slot.
+ }
+ __syncthreads();
+ idx += s_temp;
+
+ // Write to memory.
+ if (tri1 != tri0) p.workBuffer[idx++] = make_int4(px, py, (pz << 16), 0); // Right neighbor differs.
+ if (tri2 != tri0) p.workBuffer[idx] = make_int4(px, py, (pz << 16) + (1 << AAWorkItem::FLAG_DOWN_BIT), 0); // Down neighbor differs.
+}
+
+//------------------------------------------------------------------------
+// Forward analysis kernel.
+
+__global__ void AntialiasFwdAnalysisKernel(const AntialiasKernelParams p) // Persistent-thread kernel: for each work item, finds the silhouette edge crossing between the pixel pair and blends colors accordingly.
+{
+ __shared__ int s_base;
+ int workCount = p.workBuffer[0].x;
+ for(;;)
+ {
+ // Persistent threads work fetcher.
+ __syncthreads();
+ if (threadIdx.x == 0)
+ s_base = atomicAdd(&p.workBuffer[0].y, AA_ANALYSIS_KERNEL_THREADS_PER_BLOCK);
+ __syncthreads();
+ int thread_idx = s_base + threadIdx.x;
+ if (thread_idx >= workCount)
+ return;
+
+ int4* pItem = p.workBuffer + thread_idx + 1;
+ int4 item = *pItem;
+ int px = item.x;
+ int py = item.y;
+ int pz = (int)(((unsigned int)item.z) >> 16);
+ int d = (item.z >> AAWorkItem::FLAG_DOWN_BIT) & 1; // 1 = down neighbor, 0 = right neighbor.
+
+ int pixel0 = px + p.width * (py + p.height * pz);
+ int pixel1 = pixel0 + (d ? p.width : 1);
+ float2 zt0 = ((float2*)p.rasterOut)[(pixel0 << 1) + 1]; // (depth-ish, triangle id) halves of the 4-channel raster output.
+ float2 zt1 = ((float2*)p.rasterOut)[(pixel1 << 1) + 1];
+ int tri0 = float_to_triidx(zt0.y) - 1; // -1 = background.
+ int tri1 = float_to_triidx(zt1.y) - 1;
+
+ // Select triangle based on background / depth.
+ int tri = (tri0 >= 0) ? tri0 : tri1;
+ if (tri0 >= 0 && tri1 >= 0)
+ tri = (zt0.x < zt1.x) ? tri0 : tri1;
+ if (tri == tri1)
+ {
+ // Calculate with respect to neighbor pixel if chose that triangle.
+ px += 1 - d;
+ py += d;
+ }
+
+ // Bail out if triangle index is corrupt.
+ if (tri < 0 || tri >= p.numTriangles)
+ continue;
+
+ // Fetch vertex indices.
+ int vi0 = p.tri[tri * 3 + 0];
+ int vi1 = p.tri[tri * 3 + 1];
+ int vi2 = p.tri[tri * 3 + 2];
+
+ // Bail out if vertex indices are corrupt.
+ if (vi0 < 0 || vi0 >= p.numVertices ||
+ vi1 < 0 || vi1 >= p.numVertices ||
+ vi2 < 0 || vi2 >= p.numVertices)
+ continue;
+
+ // Fetch opposite vertex indices. Use vertex itself (always silhouette) if no opposite vertex exists.
+ int op0 = evhash_find_vertex(p, vi2, vi1, vi0);
+ int op1 = evhash_find_vertex(p, vi0, vi2, vi1);
+ int op2 = evhash_find_vertex(p, vi1, vi0, vi2);
+
+ // Instance mode: Adjust vertex indices based on minibatch index.
+ if (p.instance_mode)
+ {
+ int vbase = pz * p.numVertices;
+ vi0 += vbase;
+ vi1 += vbase;
+ vi2 += vbase;
+ if (op0 >= 0) op0 += vbase;
+ if (op1 >= 0) op1 += vbase;
+ if (op2 >= 0) op2 += vbase;
+ }
+
+ // Fetch vertex positions.
+ float4 p0 = ((float4*)p.pos)[vi0];
+ float4 p1 = ((float4*)p.pos)[vi1];
+ float4 p2 = ((float4*)p.pos)[vi2];
+ float4 o0 = (op0 < 0) ? p0 : ((float4*)p.pos)[op0]; // Missing wing vertex falls back to the edge's own vertex.
+ float4 o1 = (op1 < 0) ? p1 : ((float4*)p.pos)[op1];
+ float4 o2 = (op2 < 0) ? p2 : ((float4*)p.pos)[op2];
+
+ // Project vertices to pixel space.
+ float w0 = 1.f / p0.w;
+ float w1 = 1.f / p1.w;
+ float w2 = 1.f / p2.w;
+ float ow0 = 1.f / o0.w;
+ float ow1 = 1.f / o1.w;
+ float ow2 = 1.f / o2.w;
+ float fx = (float)px + .5f - p.xh; // Pixel center relative to viewport center.
+ float fy = (float)py + .5f - p.yh;
+ float x0 = p0.x * w0 * p.xh - fx; // Coordinates below are relative to the pixel center.
+ float y0 = p0.y * w0 * p.yh - fy;
+ float x1 = p1.x * w1 * p.xh - fx;
+ float y1 = p1.y * w1 * p.yh - fy;
+ float x2 = p2.x * w2 * p.xh - fx;
+ float y2 = p2.y * w2 * p.yh - fy;
+ float ox0 = o0.x * ow0 * p.xh - fx;
+ float oy0 = o0.y * ow0 * p.yh - fy;
+ float ox1 = o1.x * ow1 * p.xh - fx;
+ float oy1 = o1.y * ow1 * p.yh - fy;
+ float ox2 = o2.x * ow2 * p.xh - fx;
+ float oy2 = o2.y * ow2 * p.yh - fy;
+
+ // Signs to kill non-silhouette edges.
+ float bb = (x1-x0)*(y2-y0) - (x2-x0)*(y1-y0); // Triangle itself.
+ float a0 = (x1-ox0)*(y2-oy0) - (x2-ox0)*(y1-oy0); // Wings.
+ float a1 = (x2-ox1)*(y0-oy1) - (x0-ox1)*(y2-oy1);
+ float a2 = (x0-ox2)*(y1-oy2) - (x1-ox2)*(y0-oy2);
+
+ // If no matching signs anywhere, skip the rest.
+ if (same_sign(a0, bb) || same_sign(a1, bb) || same_sign(a2, bb))
+ {
+ // XY flip for horizontal edges.
+ if (d)
+ {
+ swap(x0, y0);
+ swap(x1, y1);
+ swap(x2, y2);
+ }
+
+ float dx0 = x2 - x1; // Edge deltas, one per triangle edge.
+ float dx1 = x0 - x2;
+ float dx2 = x1 - x0;
+ float dy0 = y2 - y1;
+ float dy1 = y0 - y2;
+ float dy2 = y1 - y0;
+
+ // Check if an edge crosses between us and the neighbor pixel.
+ float dc = -F32_MAX;
+ float ds = (tri == tri0) ? 1.f : -1.f; // +1 if we kept this pixel's triangle, -1 for the neighbor's.
+ float d0 = ds * (x1*dy0 - y1*dx0);
+ float d1 = ds * (x2*dy1 - y2*dx1);
+ float d2 = ds * (x0*dy2 - y0*dx2);
+
+ if (same_sign(y1, y2)) d0 = -F32_MAX, dy0 = 1.f; // Edge endpoints on the same side: cannot cross, disqualify.
+ if (same_sign(y2, y0)) d1 = -F32_MAX, dy1 = 1.f;
+ if (same_sign(y0, y1)) d2 = -F32_MAX, dy2 = 1.f;
+
+ int di = max_idx3(d0, d1, d2, dy0, dy1, dy2);
+ if (di == 0 && same_sign(a0, bb) && fabsf(dy0) >= fabsf(dx0)) dc = d0 / dy0;
+ if (di == 1 && same_sign(a1, bb) && fabsf(dy1) >= fabsf(dx1)) dc = d1 / dy1;
+ if (di == 2 && same_sign(a2, bb) && fabsf(dy2) >= fabsf(dx2)) dc = d2 / dy2;
+ float eps = .0625f; // Expect no more than 1/16 pixel inaccuracy.
+
+ // Adjust output image if a suitable edge was found.
+ if (dc > -eps && dc < 1.f + eps)
+ {
+ dc = fminf(fmaxf(dc, 0.f), 1.f);
+ float alpha = ds * (.5f - dc); // Signed coverage adjustment.
+ const float* pColor0 = p.color + pixel0 * p.channels;
+ const float* pColor1 = p.color + pixel1 * p.channels;
+ float* pOutput = p.output + (alpha > 0.f ? pixel0 : pixel1) * p.channels; // Blend into whichever pixel gains coverage.
+ for (int i=0; i < p.channels; i++)
+ atomicAdd(&pOutput[i], alpha * (pColor1[i] - pColor0[i]));
+
+ // Rewrite the work item's flags and alpha. Keep original px, py.
+ unsigned int flags = pz << 16;
+ flags |= di;
+ flags |= d << AAWorkItem::FLAG_DOWN_BIT;
+ flags |= (__float_as_uint(ds) >> 31) << AAWorkItem::FLAG_TRI1_BIT;
+ ((int2*)pItem)[1] = make_int2(flags, __float_as_int(alpha)); // Stored for the gradient kernel.
+ }
+ }
+ }
+}
+
+//------------------------------------------------------------------------
+// Gradient kernel.
+
+__global__ void AntialiasGradKernel(const AntialiasKernelParams p) // Backward pass: replays the work items written by the analysis kernel and scatters color and position gradients.
+{
+ // Temporary space for coalesced atomics.
+ CA_DECLARE_TEMP(AA_GRAD_KERNEL_THREADS_PER_BLOCK);
+ __shared__ int s_base; // Work counter communication across entire CTA.
+
+ int workCount = p.workBuffer[0].x;
+
+ for(;;)
+ {
+ // Persistent threads work fetcher.
+ __syncthreads();
+ if (threadIdx.x == 0)
+ s_base = atomicAdd(&p.workBuffer[0].y, AA_GRAD_KERNEL_THREADS_PER_BLOCK);
+ __syncthreads();
+ int thread_idx = s_base + threadIdx.x;
+ if (thread_idx >= workCount)
+ return;
+
+ // Read work item filled out by forward kernel.
+ int4 item = p.workBuffer[thread_idx + 1];
+ unsigned int amask = __ballot_sync(0xffffffffu, item.w); // Track which lanes stay active for the coalesced atomics below.
+ if (item.w == 0)
+ continue; // No effect.
+
+ // Unpack work item and replicate setup from forward analysis kernel.
+ int px = item.x;
+ int py = item.y;
+ int pz = (int)(((unsigned int)item.z) >> 16);
+ int d = (item.z >> AAWorkItem::FLAG_DOWN_BIT) & 1;
+ float alpha = __int_as_float(item.w);
+ int tri1 = (item.z >> AAWorkItem::FLAG_TRI1_BIT) & 1;
+ int di = item.z & AAWorkItem::EDGE_MASK;
+ float ds = __int_as_float(__float_as_int(1.0) | (tri1 << 31)); // Rebuild +-1.f from the stored sign bit.
+ int pixel0 = px + p.width * (py + p.height * pz);
+ int pixel1 = pixel0 + (d ? p.width : 1);
+ int tri = float_to_triidx(p.rasterOut[((tri1 ? pixel1 : pixel0) << 2) + 3]) - 1;
+ if (tri1)
+ {
+ px += 1 - d; // Work relative to the neighbor pixel, as the forward kernel did.
+ py += d;
+ }
+
+ // Bail out if triangle index is corrupt.
+ bool triFail = (tri < 0 || tri >= p.numTriangles);
+ amask = __ballot_sync(amask, !triFail);
+ if (triFail)
+ continue;
+
+ // Outgoing color gradients.
+ float* pGrad0 = p.gradColor + pixel0 * p.channels;
+ float* pGrad1 = p.gradColor + pixel1 * p.channels;
+
+ // Incoming color gradients.
+ const float* pDy = p.dy + (alpha > 0.f ? pixel0 : pixel1) * p.channels; // Same pixel the forward pass blended into.
+
+ // Position gradient weight based on colors and incoming gradients.
+ float dd = 0.f;
+ const float* pColor0 = p.color + pixel0 * p.channels;
+ const float* pColor1 = p.color + pixel1 * p.channels;
+
+ // Loop over channels and accumulate.
+ for (int i=0; i < p.channels; i++)
+ {
+ float dy = pDy[i];
+ if (dy != 0.f)
+ {
+ // Update position gradient weight.
+ dd += dy * (pColor1[i] - pColor0[i]);
+
+ // Update color gradients. No coalescing because all have different targets.
+ float v = alpha * dy;
+ atomicAdd(&pGrad0[i], -v);
+ atomicAdd(&pGrad1[i], v);
+ }
+ }
+
+ // If position weight is zero, skip the rest.
+ bool noGrad = (dd == 0.f);
+ amask = __ballot_sync(amask, !noGrad);
+ if (noGrad)
+ continue;
+
+ // Fetch vertex indices of the active edge and their positions.
+ int i1 = (di < 2) ? (di + 1) : 0; // Edge di runs between triangle corners i1 and i2.
+ int i2 = (i1 < 2) ? (i1 + 1) : 0;
+ int vi1 = p.tri[3 * tri + i1];
+ int vi2 = p.tri[3 * tri + i2];
+
+ // Bail out if vertex indices are corrupt.
+ bool vtxFail = (vi1 < 0 || vi1 >= p.numVertices || vi2 < 0 || vi2 >= p.numVertices);
+ amask = __ballot_sync(amask, !vtxFail);
+ if (vtxFail)
+ continue;
+
+ // Instance mode: Adjust vertex indices based on minibatch index.
+ if (p.instance_mode)
+ {
+ vi1 += pz * p.numVertices;
+ vi2 += pz * p.numVertices;
+ }
+
+ // Fetch vertex positions.
+ float4 p1 = ((float4*)p.pos)[vi1];
+ float4 p2 = ((float4*)p.pos)[vi2];
+
+ // Project vertices to pixel space.
+ float pxh = p.xh;
+ float pyh = p.yh;
+ float fx = (float)px + .5f - pxh;
+ float fy = (float)py + .5f - pyh;
+
+ // XY flip for horizontal edges.
+ if (d)
+ {
+ swap(p1.x, p1.y);
+ swap(p2.x, p2.y);
+ swap(pxh, pyh);
+ swap(fx, fy);
+ }
+
+ // Gradient calculation setup.
+ float w1 = 1.f / p1.w;
+ float w2 = 1.f / p2.w;
+ float x1 = p1.x * w1 * pxh - fx; // Pixel-space coordinates relative to the pixel center, as in the forward pass.
+ float y1 = p1.y * w1 * pyh - fy;
+ float x2 = p2.x * w2 * pxh - fx;
+ float y2 = p2.y * w2 * pyh - fy;
+ float dx = x2 - x1;
+ float dy = y2 - y1;
+ float db = x1*dy - y1*dx;
+
+ // Compute inverse delta-y with epsilon.
+ float ep = copysignf(1e-3f, dy); // ~1/1000 pixel.
+ float iy = 1.f / (dy + ep);
+
+ // Compute position gradients.
+ float dby = db * iy;
+ float iw1 = -w1 * iy * dd;
+ float iw2 = w2 * iy * dd;
+ float gp1x = iw1 * pxh * y2;
+ float gp2x = iw2 * pxh * y1;
+ float gp1y = iw1 * pyh * (dby - x2);
+ float gp2y = iw2 * pyh * (dby - x1);
+ float gp1w = -(p1.x * gp1x + p1.y * gp1y) * w1; // Chain rule through the 1/w perspective divide.
+ float gp2w = -(p2.x * gp2x + p2.y * gp2y) * w2;
+
+ // XY flip the gradients.
+ if (d)
+ {
+ swap(gp1x, gp1y);
+ swap(gp2x, gp2y);
+ }
+
+ // Kill position gradients if alpha was saturated.
+ if (fabsf(alpha) >= 0.5f)
+ {
+ gp1x = gp1y = gp1w = 0.f;
+ gp2x = gp2y = gp2w = 0.f;
+ }
+
+ // Initialize coalesced atomics. Match both triangle ID and edge index.
+ // Also note that some threads may be inactive.
+ CA_SET_GROUP_MASK(tri ^ (di << 30), amask);
+
+ // Accumulate gradients.
+ caAtomicAdd3_xyw(p.gradPos + 4 * vi1, gp1x, gp1y, gp1w);
+ caAtomicAdd3_xyw(p.gradPos + 4 * vi2, gp2x, gp2y, gp2w);
+ }
+}
+
+//------------------------------------------------------------------------
diff --git a/LAM_gpro/external/nvdiffrast/nvdiffrast/common/antialias.h b/LAM_gpro/external/nvdiffrast/nvdiffrast/common/antialias.h
new file mode 100644
index 0000000..a324f2f
--- /dev/null
+++ b/LAM_gpro/external/nvdiffrast/nvdiffrast/common/antialias.h
@@ -0,0 +1,50 @@
+// Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
+//
+// NVIDIA CORPORATION and its licensors retain all intellectual property
+// and proprietary rights in and to this software, related documentation
+// and any modifications thereto. Any use, reproduction, disclosure or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA CORPORATION is strictly prohibited.
+
+#pragma once
+#include "common.h"
+
+//------------------------------------------------------------------------
+// Constants and helpers.
+
+#define AA_DISCONTINUITY_KERNEL_BLOCK_WIDTH 32
+#define AA_DISCONTINUITY_KERNEL_BLOCK_HEIGHT 8
+#define AA_ANALYSIS_KERNEL_THREADS_PER_BLOCK 256
+#define AA_MESH_KERNEL_THREADS_PER_BLOCK 256
+#define AA_HASH_ELEMENTS_PER_TRIANGLE(alloc) ((alloc) >= (2 << 24) ? 4 : 8) // With more than 16777216 triangles (alloc >= 33554432) use smallest possible value of 4 to conserve memory, otherwise use 8 for fewer collisions.
+#define AA_LOG_HASH_ELEMENTS_PER_TRIANGLE(alloc) ((alloc) >= (2 << 24) ? 2 : 3) // log2 of the above; must use the exact same threshold so hash-table sizing and indexing agree.
+#define AA_GRAD_KERNEL_THREADS_PER_BLOCK 256
+
+//------------------------------------------------------------------------
+// CUDA kernel params.
+
+struct AntialiasKernelParams
+{
+ const float* color; // Incoming color buffer.
+ const float* rasterOut; // Incoming rasterizer output buffer (4 floats per pixel; component 3 is the triangle id).
+ const int* tri; // Incoming triangle buffer (3 vertex indices per triangle).
+ const float* pos; // Incoming position buffer (float4 clip-space positions).
+ float* output; // Output buffer of forward kernel.
+ const float* dy; // Incoming gradients.
+ float* gradColor; // Output buffer, color gradient.
+ float* gradPos; // Output buffer, position gradient.
+ int4* workBuffer; // Buffer for storing intermediate work items. First item reserved for counters.
+ uint4* evHash; // Edge-vertex hash.
+ int allocTriangles; // Number of triangles accommodated by evHash. Always power of two.
+ int numTriangles; // Number of triangles.
+ int numVertices; // Number of vertices.
+ int width; // Input width.
+ int height; // Input height.
+ int n; // Minibatch size.
+ int channels; // Channel count in color input.
+ float xh, yh; // Transfer to pixel space; the kernels scale clip-space x/w, y/w by these.
+ int instance_mode; // 0=normal, 1=instance mode (vertex indices offset by minibatch index).
+ int tri_const; // 1 if triangle array is known to be constant.
+};
+
+//------------------------------------------------------------------------
diff --git a/LAM_gpro/external/nvdiffrast/nvdiffrast/common/common.cpp b/LAM_gpro/external/nvdiffrast/nvdiffrast/common/common.cpp
new file mode 100644
index 0000000..e566c03
--- /dev/null
+++ b/LAM_gpro/external/nvdiffrast/nvdiffrast/common/common.cpp
@@ -0,0 +1,60 @@
+// Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
+//
+// NVIDIA CORPORATION and its licensors retain all intellectual property
+// and proprietary rights in and to this software, related documentation
+// and any modifications thereto. Any use, reproduction, disclosure or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA CORPORATION is strictly prohibited.
+
+#include <cuda_runtime.h>
+
+//------------------------------------------------------------------------
+// Block and grid size calculators for kernel launches.
+
+dim3 getLaunchBlockSize(int maxWidth, int maxHeight, int width, int height) // Picks a 2D block size no larger than maxWidth x maxHeight, shrunk/reshaped to fit a width x height buffer.
+{
+ int maxThreads = maxWidth * maxHeight;
+ if (maxThreads <= 1 || (width * height) <= 1)
+ return dim3(1, 1, 1); // Degenerate.
+
+ // Start from max size.
+ int bw = maxWidth;
+ int bh = maxHeight;
+
+ // Optimizations for weirdly sized buffers.
+ if (width < bw)
+ {
+ // Decrease block width to smallest power of two that covers the buffer width.
+ while ((bw >> 1) >= width)
+ bw >>= 1;
+
+ // Maximize height.
+ bh = maxThreads / bw; // Keep total thread count roughly constant.
+ if (bh > height)
+ bh = height;
+ }
+ else if (height < bh)
+ {
+ // Halve height and double width until fits completely inside buffer vertically.
+ while (bh > height)
+ {
+ bh >>= 1;
+ if (bw < width)
+ bw <<= 1;
+ }
+ }
+
+ // Done.
+ return dim3(bw, bh, 1);
+}
+
+dim3 getLaunchGridSize(dim3 blockSize, int width, int height, int depth) // Grid dimensions covering width x height x depth: ceil-divide each extent by the block size.
+{
+ dim3 gridSize;
+ gridSize.x = (width - 1) / blockSize.x + 1;
+ gridSize.y = (height - 1) / blockSize.y + 1;
+ gridSize.z = (depth - 1) / blockSize.z + 1;
+ return gridSize;
+}
+
+//------------------------------------------------------------------------
diff --git a/LAM_gpro/external/nvdiffrast/nvdiffrast/common/common.h b/LAM_gpro/external/nvdiffrast/nvdiffrast/common/common.h
new file mode 100644
index 0000000..01ecf9f
--- /dev/null
+++ b/LAM_gpro/external/nvdiffrast/nvdiffrast/common/common.h
@@ -0,0 +1,263 @@
+// Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
+//
+// NVIDIA CORPORATION and its licensors retain all intellectual property
+// and proprietary rights in and to this software, related documentation
+// and any modifications thereto. Any use, reproduction, disclosure or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA CORPORATION is strictly prohibited.
+
+#pragma once
+#include <cuda.h>
+#include <stdint.h>
+
+//------------------------------------------------------------------------
+// C++ helper function prototypes.
+
+dim3 getLaunchBlockSize(int maxWidth, int maxHeight, int width, int height);
+dim3 getLaunchGridSize(dim3 blockSize, int width, int height, int depth);
+
+//------------------------------------------------------------------------
+// The rest is CUDA device code specific stuff.
+
+#ifdef __CUDACC__
+
+//------------------------------------------------------------------------
+// Helpers for CUDA vector types.
+
+static __device__ __forceinline__ float2& operator*= (float2& a, const float2& b) { a.x *= b.x; a.y *= b.y; return a; }
+static __device__ __forceinline__ float2& operator+= (float2& a, const float2& b) { a.x += b.x; a.y += b.y; return a; }
+static __device__ __forceinline__ float2& operator-= (float2& a, const float2& b) { a.x -= b.x; a.y -= b.y; return a; }
+static __device__ __forceinline__ float2& operator*= (float2& a, float b) { a.x *= b; a.y *= b; return a; }
+static __device__ __forceinline__ float2& operator+= (float2& a, float b) { a.x += b; a.y += b; return a; }
+static __device__ __forceinline__ float2& operator-= (float2& a, float b) { a.x -= b; a.y -= b; return a; }
+static __device__ __forceinline__ float2 operator* (const float2& a, const float2& b) { return make_float2(a.x * b.x, a.y * b.y); }
+static __device__ __forceinline__ float2 operator+ (const float2& a, const float2& b) { return make_float2(a.x + b.x, a.y + b.y); }
+static __device__ __forceinline__ float2 operator- (const float2& a, const float2& b) { return make_float2(a.x - b.x, a.y - b.y); }
+static __device__ __forceinline__ float2 operator* (const float2& a, float b) { return make_float2(a.x * b, a.y * b); }
+static __device__ __forceinline__ float2 operator+ (const float2& a, float b) { return make_float2(a.x + b, a.y + b); }
+static __device__ __forceinline__ float2 operator- (const float2& a, float b) { return make_float2(a.x - b, a.y - b); }
+static __device__ __forceinline__ float2 operator* (float a, const float2& b) { return make_float2(a * b.x, a * b.y); }
+static __device__ __forceinline__ float2 operator+ (float a, const float2& b) { return make_float2(a + b.x, a + b.y); }
+static __device__ __forceinline__ float2 operator- (float a, const float2& b) { return make_float2(a - b.x, a - b.y); }
+static __device__ __forceinline__ float2 operator- (const float2& a) { return make_float2(-a.x, -a.y); }
+static __device__ __forceinline__ float3& operator*= (float3& a, const float3& b) { a.x *= b.x; a.y *= b.y; a.z *= b.z; return a; }
+static __device__ __forceinline__ float3& operator+= (float3& a, const float3& b) { a.x += b.x; a.y += b.y; a.z += b.z; return a; }
+static __device__ __forceinline__ float3& operator-= (float3& a, const float3& b) { a.x -= b.x; a.y -= b.y; a.z -= b.z; return a; }
+static __device__ __forceinline__ float3& operator*= (float3& a, float b) { a.x *= b; a.y *= b; a.z *= b; return a; }
+static __device__ __forceinline__ float3& operator+= (float3& a, float b) { a.x += b; a.y += b; a.z += b; return a; }
+static __device__ __forceinline__ float3& operator-= (float3& a, float b) { a.x -= b; a.y -= b; a.z -= b; return a; }
+static __device__ __forceinline__ float3 operator* (const float3& a, const float3& b) { return make_float3(a.x * b.x, a.y * b.y, a.z * b.z); }
+static __device__ __forceinline__ float3 operator+ (const float3& a, const float3& b) { return make_float3(a.x + b.x, a.y + b.y, a.z + b.z); }
+static __device__ __forceinline__ float3 operator- (const float3& a, const float3& b) { return make_float3(a.x - b.x, a.y - b.y, a.z - b.z); }
+static __device__ __forceinline__ float3 operator* (const float3& a, float b) { return make_float3(a.x * b, a.y * b, a.z * b); }
+static __device__ __forceinline__ float3 operator+ (const float3& a, float b) { return make_float3(a.x + b, a.y + b, a.z + b); }
+static __device__ __forceinline__ float3 operator- (const float3& a, float b) { return make_float3(a.x - b, a.y - b, a.z - b); }
+static __device__ __forceinline__ float3 operator* (float a, const float3& b) { return make_float3(a * b.x, a * b.y, a * b.z); }
+static __device__ __forceinline__ float3 operator+ (float a, const float3& b) { return make_float3(a + b.x, a + b.y, a + b.z); }
+static __device__ __forceinline__ float3 operator- (float a, const float3& b) { return make_float3(a - b.x, a - b.y, a - b.z); }
+static __device__ __forceinline__ float3 operator- (const float3& a) { return make_float3(-a.x, -a.y, -a.z); }
+static __device__ __forceinline__ float4& operator*= (float4& a, const float4& b) { a.x *= b.x; a.y *= b.y; a.z *= b.z; a.w *= b.w; return a; }
+static __device__ __forceinline__ float4& operator+= (float4& a, const float4& b) { a.x += b.x; a.y += b.y; a.z += b.z; a.w += b.w; return a; }
+static __device__ __forceinline__ float4& operator-= (float4& a, const float4& b) { a.x -= b.x; a.y -= b.y; a.z -= b.z; a.w -= b.w; return a; }
+static __device__ __forceinline__ float4& operator*= (float4& a, float b) { a.x *= b; a.y *= b; a.z *= b; a.w *= b; return a; }
+static __device__ __forceinline__ float4& operator+= (float4& a, float b) { a.x += b; a.y += b; a.z += b; a.w += b; return a; }
+static __device__ __forceinline__ float4& operator-= (float4& a, float b) { a.x -= b; a.y -= b; a.z -= b; a.w -= b; return a; }
+static __device__ __forceinline__ float4 operator* (const float4& a, const float4& b) { return make_float4(a.x * b.x, a.y * b.y, a.z * b.z, a.w * b.w); }
+static __device__ __forceinline__ float4 operator+ (const float4& a, const float4& b) { return make_float4(a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w); }
+static __device__ __forceinline__ float4 operator- (const float4& a, const float4& b) { return make_float4(a.x - b.x, a.y - b.y, a.z - b.z, a.w - b.w); }
+static __device__ __forceinline__ float4 operator* (const float4& a, float b) { return make_float4(a.x * b, a.y * b, a.z * b, a.w * b); }
+static __device__ __forceinline__ float4 operator+ (const float4& a, float b) { return make_float4(a.x + b, a.y + b, a.z + b, a.w + b); }
+static __device__ __forceinline__ float4 operator- (const float4& a, float b) { return make_float4(a.x - b, a.y - b, a.z - b, a.w - b); }
+static __device__ __forceinline__ float4 operator* (float a, const float4& b) { return make_float4(a * b.x, a * b.y, a * b.z, a * b.w); }
+static __device__ __forceinline__ float4 operator+ (float a, const float4& b) { return make_float4(a + b.x, a + b.y, a + b.z, a + b.w); }
+static __device__ __forceinline__ float4 operator- (float a, const float4& b) { return make_float4(a - b.x, a - b.y, a - b.z, a - b.w); }
+static __device__ __forceinline__ float4 operator- (const float4& a) { return make_float4(-a.x, -a.y, -a.z, -a.w); }
+static __device__ __forceinline__ int2& operator*= (int2& a, const int2& b) { a.x *= b.x; a.y *= b.y; return a; }
+static __device__ __forceinline__ int2& operator+= (int2& a, const int2& b) { a.x += b.x; a.y += b.y; return a; }
+static __device__ __forceinline__ int2& operator-= (int2& a, const int2& b) { a.x -= b.x; a.y -= b.y; return a; }
+static __device__ __forceinline__ int2& operator*= (int2& a, int b) { a.x *= b; a.y *= b; return a; }
+static __device__ __forceinline__ int2& operator+= (int2& a, int b) { a.x += b; a.y += b; return a; }
+static __device__ __forceinline__ int2& operator-= (int2& a, int b) { a.x -= b; a.y -= b; return a; }
+static __device__ __forceinline__ int2 operator* (const int2& a, const int2& b) { return make_int2(a.x * b.x, a.y * b.y); }
+static __device__ __forceinline__ int2 operator+ (const int2& a, const int2& b) { return make_int2(a.x + b.x, a.y + b.y); }
+static __device__ __forceinline__ int2 operator- (const int2& a, const int2& b) { return make_int2(a.x - b.x, a.y - b.y); }
+static __device__ __forceinline__ int2 operator* (const int2& a, int b) { return make_int2(a.x * b, a.y * b); }
+static __device__ __forceinline__ int2 operator+ (const int2& a, int b) { return make_int2(a.x + b, a.y + b); }
+static __device__ __forceinline__ int2 operator- (const int2& a, int b) { return make_int2(a.x - b, a.y - b); }
+static __device__ __forceinline__ int2 operator* (int a, const int2& b) { return make_int2(a * b.x, a * b.y); }
+static __device__ __forceinline__ int2 operator+ (int a, const int2& b) { return make_int2(a + b.x, a + b.y); }
+static __device__ __forceinline__ int2 operator- (int a, const int2& b) { return make_int2(a - b.x, a - b.y); }
+static __device__ __forceinline__ int2 operator- (const int2& a) { return make_int2(-a.x, -a.y); }
+static __device__ __forceinline__ int3& operator*= (int3& a, const int3& b) { a.x *= b.x; a.y *= b.y; a.z *= b.z; return a; }
+static __device__ __forceinline__ int3& operator+= (int3& a, const int3& b) { a.x += b.x; a.y += b.y; a.z += b.z; return a; }
+static __device__ __forceinline__ int3& operator-= (int3& a, const int3& b) { a.x -= b.x; a.y -= b.y; a.z -= b.z; return a; }
+static __device__ __forceinline__ int3& operator*= (int3& a, int b) { a.x *= b; a.y *= b; a.z *= b; return a; }
+static __device__ __forceinline__ int3& operator+= (int3& a, int b) { a.x += b; a.y += b; a.z += b; return a; }
+static __device__ __forceinline__ int3& operator-= (int3& a, int b) { a.x -= b; a.y -= b; a.z -= b; return a; }
+static __device__ __forceinline__ int3 operator* (const int3& a, const int3& b) { return make_int3(a.x * b.x, a.y * b.y, a.z * b.z); }
+static __device__ __forceinline__ int3 operator+ (const int3& a, const int3& b) { return make_int3(a.x + b.x, a.y + b.y, a.z + b.z); }
+static __device__ __forceinline__ int3 operator- (const int3& a, const int3& b) { return make_int3(a.x - b.x, a.y - b.y, a.z - b.z); }
+static __device__ __forceinline__ int3 operator* (const int3& a, int b) { return make_int3(a.x * b, a.y * b, a.z * b); }
+static __device__ __forceinline__ int3 operator+ (const int3& a, int b) { return make_int3(a.x + b, a.y + b, a.z + b); }
+static __device__ __forceinline__ int3 operator- (const int3& a, int b) { return make_int3(a.x - b, a.y - b, a.z - b); }
+static __device__ __forceinline__ int3 operator* (int a, const int3& b) { return make_int3(a * b.x, a * b.y, a * b.z); }
+static __device__ __forceinline__ int3 operator+ (int a, const int3& b) { return make_int3(a + b.x, a + b.y, a + b.z); }
+static __device__ __forceinline__ int3 operator- (int a, const int3& b) { return make_int3(a - b.x, a - b.y, a - b.z); }
+static __device__ __forceinline__ int3 operator- (const int3& a) { return make_int3(-a.x, -a.y, -a.z); }
+static __device__ __forceinline__ int4& operator*= (int4& a, const int4& b) { a.x *= b.x; a.y *= b.y; a.z *= b.z; a.w *= b.w; return a; }
+static __device__ __forceinline__ int4& operator+= (int4& a, const int4& b) { a.x += b.x; a.y += b.y; a.z += b.z; a.w += b.w; return a; }
+static __device__ __forceinline__ int4& operator-= (int4& a, const int4& b) { a.x -= b.x; a.y -= b.y; a.z -= b.z; a.w -= b.w; return a; }
+static __device__ __forceinline__ int4& operator*= (int4& a, int b) { a.x *= b; a.y *= b; a.z *= b; a.w *= b; return a; }
+static __device__ __forceinline__ int4& operator+= (int4& a, int b) { a.x += b; a.y += b; a.z += b; a.w += b; return a; }
+static __device__ __forceinline__ int4& operator-= (int4& a, int b) { a.x -= b; a.y -= b; a.z -= b; a.w -= b; return a; }
+static __device__ __forceinline__ int4 operator* (const int4& a, const int4& b) { return make_int4(a.x * b.x, a.y * b.y, a.z * b.z, a.w * b.w); }
+static __device__ __forceinline__ int4 operator+ (const int4& a, const int4& b) { return make_int4(a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w); }
+static __device__ __forceinline__ int4 operator- (const int4& a, const int4& b) { return make_int4(a.x - b.x, a.y - b.y, a.z - b.z, a.w - b.w); }
+static __device__ __forceinline__ int4 operator* (const int4& a, int b) { return make_int4(a.x * b, a.y * b, a.z * b, a.w * b); }
+static __device__ __forceinline__ int4 operator+ (const int4& a, int b) { return make_int4(a.x + b, a.y + b, a.z + b, a.w + b); }
+static __device__ __forceinline__ int4 operator- (const int4& a, int b) { return make_int4(a.x - b, a.y - b, a.z - b, a.w - b); }
+static __device__ __forceinline__ int4 operator* (int a, const int4& b) { return make_int4(a * b.x, a * b.y, a * b.z, a * b.w); }
+static __device__ __forceinline__ int4 operator+ (int a, const int4& b) { return make_int4(a + b.x, a + b.y, a + b.z, a + b.w); }
+static __device__ __forceinline__ int4 operator- (int a, const int4& b) { return make_int4(a - b.x, a - b.y, a - b.z, a - b.w); }
+static __device__ __forceinline__ int4 operator- (const int4& a) { return make_int4(-a.x, -a.y, -a.z, -a.w); }
+static __device__ __forceinline__ uint2& operator*= (uint2& a, const uint2& b) { a.x *= b.x; a.y *= b.y; return a; }
+static __device__ __forceinline__ uint2& operator+= (uint2& a, const uint2& b) { a.x += b.x; a.y += b.y; return a; }
+static __device__ __forceinline__ uint2& operator-= (uint2& a, const uint2& b) { a.x -= b.x; a.y -= b.y; return a; }
+static __device__ __forceinline__ uint2& operator*= (uint2& a, unsigned int b) { a.x *= b; a.y *= b; return a; }
+static __device__ __forceinline__ uint2& operator+= (uint2& a, unsigned int b) { a.x += b; a.y += b; return a; }
+static __device__ __forceinline__ uint2& operator-= (uint2& a, unsigned int b) { a.x -= b; a.y -= b; return a; }
+static __device__ __forceinline__ uint2 operator* (const uint2& a, const uint2& b) { return make_uint2(a.x * b.x, a.y * b.y); }
+static __device__ __forceinline__ uint2 operator+ (const uint2& a, const uint2& b) { return make_uint2(a.x + b.x, a.y + b.y); }
+static __device__ __forceinline__ uint2 operator- (const uint2& a, const uint2& b) { return make_uint2(a.x - b.x, a.y - b.y); }
+static __device__ __forceinline__ uint2 operator* (const uint2& a, unsigned int b) { return make_uint2(a.x * b, a.y * b); }
+static __device__ __forceinline__ uint2 operator+ (const uint2& a, unsigned int b) { return make_uint2(a.x + b, a.y + b); }
+static __device__ __forceinline__ uint2 operator- (const uint2& a, unsigned int b) { return make_uint2(a.x - b, a.y - b); }
+static __device__ __forceinline__ uint2 operator* (unsigned int a, const uint2& b) { return make_uint2(a * b.x, a * b.y); }
+static __device__ __forceinline__ uint2 operator+ (unsigned int a, const uint2& b) { return make_uint2(a + b.x, a + b.y); }
+static __device__ __forceinline__ uint2 operator- (unsigned int a, const uint2& b) { return make_uint2(a - b.x, a - b.y); }
+static __device__ __forceinline__ uint3& operator*= (uint3& a, const uint3& b) { a.x *= b.x; a.y *= b.y; a.z *= b.z; return a; }
+static __device__ __forceinline__ uint3& operator+= (uint3& a, const uint3& b) { a.x += b.x; a.y += b.y; a.z += b.z; return a; }
+static __device__ __forceinline__ uint3& operator-= (uint3& a, const uint3& b) { a.x -= b.x; a.y -= b.y; a.z -= b.z; return a; }
+static __device__ __forceinline__ uint3& operator*= (uint3& a, unsigned int b) { a.x *= b; a.y *= b; a.z *= b; return a; }
+static __device__ __forceinline__ uint3& operator+= (uint3& a, unsigned int b) { a.x += b; a.y += b; a.z += b; return a; }
+static __device__ __forceinline__ uint3& operator-= (uint3& a, unsigned int b) { a.x -= b; a.y -= b; a.z -= b; return a; }
+static __device__ __forceinline__ uint3 operator* (const uint3& a, const uint3& b) { return make_uint3(a.x * b.x, a.y * b.y, a.z * b.z); }
+static __device__ __forceinline__ uint3 operator+ (const uint3& a, const uint3& b) { return make_uint3(a.x + b.x, a.y + b.y, a.z + b.z); }
+static __device__ __forceinline__ uint3 operator- (const uint3& a, const uint3& b) { return make_uint3(a.x - b.x, a.y - b.y, a.z - b.z); }
+static __device__ __forceinline__ uint3 operator* (const uint3& a, unsigned int b) { return make_uint3(a.x * b, a.y * b, a.z * b); }
+static __device__ __forceinline__ uint3 operator+ (const uint3& a, unsigned int b) { return make_uint3(a.x + b, a.y + b, a.z + b); }
+static __device__ __forceinline__ uint3 operator- (const uint3& a, unsigned int b) { return make_uint3(a.x - b, a.y - b, a.z - b); }
+static __device__ __forceinline__ uint3 operator* (unsigned int a, const uint3& b) { return make_uint3(a * b.x, a * b.y, a * b.z); }
+static __device__ __forceinline__ uint3 operator+ (unsigned int a, const uint3& b) { return make_uint3(a + b.x, a + b.y, a + b.z); }
+static __device__ __forceinline__ uint3 operator- (unsigned int a, const uint3& b) { return make_uint3(a - b.x, a - b.y, a - b.z); }
+static __device__ __forceinline__ uint4& operator*= (uint4& a, const uint4& b) { a.x *= b.x; a.y *= b.y; a.z *= b.z; a.w *= b.w; return a; }
+static __device__ __forceinline__ uint4& operator+= (uint4& a, const uint4& b) { a.x += b.x; a.y += b.y; a.z += b.z; a.w += b.w; return a; }
+static __device__ __forceinline__ uint4& operator-= (uint4& a, const uint4& b) { a.x -= b.x; a.y -= b.y; a.z -= b.z; a.w -= b.w; return a; }
+static __device__ __forceinline__ uint4& operator*= (uint4& a, unsigned int b) { a.x *= b; a.y *= b; a.z *= b; a.w *= b; return a; }
+static __device__ __forceinline__ uint4& operator+= (uint4& a, unsigned int b) { a.x += b; a.y += b; a.z += b; a.w += b; return a; }
+static __device__ __forceinline__ uint4& operator-= (uint4& a, unsigned int b) { a.x -= b; a.y -= b; a.z -= b; a.w -= b; return a; }
+static __device__ __forceinline__ uint4 operator* (const uint4& a, const uint4& b) { return make_uint4(a.x * b.x, a.y * b.y, a.z * b.z, a.w * b.w); }
+static __device__ __forceinline__ uint4 operator+ (const uint4& a, const uint4& b) { return make_uint4(a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w); }
+static __device__ __forceinline__ uint4 operator- (const uint4& a, const uint4& b) { return make_uint4(a.x - b.x, a.y - b.y, a.z - b.z, a.w - b.w); }
+static __device__ __forceinline__ uint4 operator* (const uint4& a, unsigned int b) { return make_uint4(a.x * b, a.y * b, a.z * b, a.w * b); }
+static __device__ __forceinline__ uint4 operator+ (const uint4& a, unsigned int b) { return make_uint4(a.x + b, a.y + b, a.z + b, a.w + b); }
+static __device__ __forceinline__ uint4 operator- (const uint4& a, unsigned int b) { return make_uint4(a.x - b, a.y - b, a.z - b, a.w - b); }
+static __device__ __forceinline__ uint4 operator* (unsigned int a, const uint4& b) { return make_uint4(a * b.x, a * b.y, a * b.z, a * b.w); }
+static __device__ __forceinline__ uint4 operator+ (unsigned int a, const uint4& b) { return make_uint4(a + b.x, a + b.y, a + b.z, a + b.w); }
+static __device__ __forceinline__ uint4 operator- (unsigned int a, const uint4& b) { return make_uint4(a - b.x, a - b.y, a - b.z, a - b.w); }
+
+template<class T> static __device__ __forceinline__ T zero_value(void);
+template<> __device__ __forceinline__ float zero_value (void) { return 0.f; }
+template<> __device__ __forceinline__ float2 zero_value(void) { return make_float2(0.f, 0.f); }
+template<> __device__ __forceinline__ float4 zero_value(void) { return make_float4(0.f, 0.f, 0.f, 0.f); }
+static __device__ __forceinline__ float3 make_float3(const float2& a, float b) { return make_float3(a.x, a.y, b); }
+static __device__ __forceinline__ float4 make_float4(const float3& a, float b) { return make_float4(a.x, a.y, a.z, b); }
+static __device__ __forceinline__ float4 make_float4(const float2& a, const float2& b) { return make_float4(a.x, a.y, b.x, b.y); }
+static __device__ __forceinline__ int3 make_int3(const int2& a, int b) { return make_int3(a.x, a.y, b); }
+static __device__ __forceinline__ int4 make_int4(const int3& a, int b) { return make_int4(a.x, a.y, a.z, b); }
+static __device__ __forceinline__ int4 make_int4(const int2& a, const int2& b) { return make_int4(a.x, a.y, b.x, b.y); }
+static __device__ __forceinline__ uint3 make_uint3(const uint2& a, unsigned int b) { return make_uint3(a.x, a.y, b); }
+static __device__ __forceinline__ uint4 make_uint4(const uint3& a, unsigned int b) { return make_uint4(a.x, a.y, a.z, b); }
+static __device__ __forceinline__ uint4 make_uint4(const uint2& a, const uint2& b) { return make_uint4(a.x, a.y, b.x, b.y); }
+
+template<class T> static __device__ __forceinline__ void swap(T& a, T& b) { T temp = a; a = b; b = temp; }
+
+//------------------------------------------------------------------------
+// Triangle ID <-> float32 conversion functions to support very large triangle IDs.
+//
+// Values up to and including 16777216 (also, negative values) are converted trivially and retain
+// compatibility with previous versions. Larger values are mapped to unique float32 that are not equal to
+// the ID. The largest value that converts to float32 and back without generating inf or nan is 889192447.
+
+static __device__ __forceinline__ int float_to_triidx(float x) { if (x <= 16777216.f) return (int)x; return __float_as_int(x) - 0x4a800000; }
+static __device__ __forceinline__ float triidx_to_float(int x) { if (x <= 0x01000000) return (float)x; return __int_as_float(0x4a800000 + x); }
+
+//------------------------------------------------------------------------
+// Coalesced atomics. These are all done via macros.
+
+#if __CUDA_ARCH__ >= 700 // Warp match instruction __match_any_sync() is only available on compute capability 7.x and higher
+
+#define CA_TEMP _ca_temp
+#define CA_TEMP_PARAM float* CA_TEMP
+#define CA_DECLARE_TEMP(threads_per_block) \
+ __shared__ float CA_TEMP[(threads_per_block)]
+
+#define CA_SET_GROUP_MASK(group, thread_mask) \
+ bool _ca_leader; \
+ float* _ca_ptr; \
+ do { \
+ int tidx = threadIdx.x + blockDim.x * threadIdx.y; \
+ int lane = tidx & 31; \
+ int warp = tidx >> 5; \
+ int tmask = __match_any_sync((thread_mask), (group)); \
+ int leader = __ffs(tmask) - 1; \
+ _ca_leader = (leader == lane); \
+ _ca_ptr = &_ca_temp[((warp << 5) + leader)]; \
+ } while(0)
+
+#define CA_SET_GROUP(group) \
+ CA_SET_GROUP_MASK((group), 0xffffffffu)
+
+#define caAtomicAdd(ptr, value) \
+ do { \
+ if (_ca_leader) \
+ *_ca_ptr = 0.f; \
+ atomicAdd(_ca_ptr, (value)); \
+ if (_ca_leader) \
+ atomicAdd((ptr), *_ca_ptr); \
+ } while(0)
+
+#define caAtomicAdd3_xyw(ptr, x, y, w) \
+ do { \
+ caAtomicAdd((ptr), (x)); \
+ caAtomicAdd((ptr)+1, (y)); \
+ caAtomicAdd((ptr)+3, (w)); \
+ } while(0)
+
+#define caAtomicAddTexture(ptr, level, idx, value) \
+ do { \
+ CA_SET_GROUP((idx) ^ ((level) << 27)); \
+ caAtomicAdd((ptr)+(idx), (value)); \
+ } while(0)
+
+//------------------------------------------------------------------------
+// Disable atomic coalescing for compute capability lower than 7.x
+
+#else // __CUDA_ARCH__ >= 700
+#define CA_TEMP _ca_temp
+#define CA_TEMP_PARAM float CA_TEMP
+#define CA_DECLARE_TEMP(threads_per_block) CA_TEMP_PARAM
+#define CA_SET_GROUP_MASK(group, thread_mask)
+#define CA_SET_GROUP(group)
+#define caAtomicAdd(ptr, value) atomicAdd((ptr), (value))
+#define caAtomicAdd3_xyw(ptr, x, y, w) \
+ do { \
+ atomicAdd((ptr), (x)); \
+ atomicAdd((ptr)+1, (y)); \
+ atomicAdd((ptr)+3, (w)); \
+ } while(0)
+#define caAtomicAddTexture(ptr, level, idx, value) atomicAdd((ptr)+(idx), (value))
+#endif // __CUDA_ARCH__ >= 700
+
+//------------------------------------------------------------------------
+#endif // __CUDACC__
diff --git a/LAM_gpro/external/nvdiffrast/nvdiffrast/common/cudaraster/CudaRaster.hpp b/LAM_gpro/external/nvdiffrast/nvdiffrast/common/cudaraster/CudaRaster.hpp
new file mode 100644
index 0000000..3c1c3a7
--- /dev/null
+++ b/LAM_gpro/external/nvdiffrast/nvdiffrast/common/cudaraster/CudaRaster.hpp
@@ -0,0 +1,63 @@
+// Copyright (c) 2009-2022, NVIDIA CORPORATION. All rights reserved.
+//
+// NVIDIA CORPORATION and its licensors retain all intellectual property
+// and proprietary rights in and to this software, related documentation
+// and any modifications thereto. Any use, reproduction, disclosure or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA CORPORATION is strictly prohibited.
+
+#pragma once
+
+//------------------------------------------------------------------------
+// This is a slimmed-down and modernized version of the original
+// CudaRaster codebase that accompanied the HPG 2011 paper
+// "High-Performance Software Rasterization on GPUs" by Laine and Karras.
+// Modifications have been made to accommodate post-Volta execution model
+// with warp divergence. Support for shading, blending, quad rendering,
+// and supersampling have been removed as unnecessary for nvdiffrast.
+//------------------------------------------------------------------------
+
+namespace CR
+{
+
+class RasterImpl;
+
+//------------------------------------------------------------------------
+// Interface class to isolate user from implementation details.
+//------------------------------------------------------------------------
+
+class CudaRaster
+{
+public:
+ enum
+ {
+ RenderModeFlag_EnableBackfaceCulling = 1 << 0, // Enable backface culling.
+ RenderModeFlag_EnableDepthPeeling = 1 << 1, // Enable depth peeling. Must have a peel buffer set.
+ };
+
+public:
+ CudaRaster (void);
+ ~CudaRaster (void);
+
+ void setBufferSize (int width, int height, int numImages); // Width and height are internally rounded up to multiples of tile size (8x8) for buffer sizes.
+ void setViewport (int width, int height, int offsetX, int offsetY); // Tiled rendering viewport setup.
+ void setRenderModeFlags (unsigned int renderModeFlags); // Affects all subsequent calls to drawTriangles(). Defaults to zero.
+ void deferredClear (unsigned int clearColor); // Clears color and depth buffers during next call to drawTriangles().
+ void setVertexBuffer (void* vertices, int numVertices); // GPU pointer managed by caller. Vertex positions in clip space as float4 (x, y, z, w).
+ void setIndexBuffer (void* indices, int numTriangles); // GPU pointer managed by caller. Triangle index+color quadruplets as uint4 (idx0, idx1, idx2, color).
+ bool drawTriangles (const int* ranges, bool peel, cudaStream_t stream); // Ranges (offsets and counts) as #triangles entries, not as bytes. If NULL, draw all triangles. Returns false in case of internal overflow.
+ void* getColorBuffer (void); // GPU pointer managed by CudaRaster.
+ void* getDepthBuffer (void); // GPU pointer managed by CudaRaster.
+ void swapDepthAndPeel (void); // Swap depth and peeling buffers.
+
+private:
+ CudaRaster (const CudaRaster&); // forbidden
+ CudaRaster& operator= (const CudaRaster&); // forbidden
+
+private:
+ RasterImpl* m_impl; // Opaque pointer to implementation.
+};
+
+//------------------------------------------------------------------------
+} // namespace CR
+
diff --git a/LAM_gpro/external/nvdiffrast/nvdiffrast/common/cudaraster/impl/BinRaster.inl b/LAM_gpro/external/nvdiffrast/nvdiffrast/common/cudaraster/impl/BinRaster.inl
new file mode 100644
index 0000000..deae9d2
--- /dev/null
+++ b/LAM_gpro/external/nvdiffrast/nvdiffrast/common/cudaraster/impl/BinRaster.inl
@@ -0,0 +1,423 @@
+// Copyright (c) 2009-2022, NVIDIA CORPORATION. All rights reserved.
+//
+// NVIDIA CORPORATION and its licensors retain all intellectual property
+// and proprietary rights in and to this software, related documentation
+// and any modifications thereto. Any use, reproduction, disclosure or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA CORPORATION is strictly prohibited.
+
+//------------------------------------------------------------------------
+
+// Bin rasterizer stage. One thread block (CR_BIN_WARPS warps; blockIdx.x
+// selects the output bin stream, blockIdx.z the image) repeatedly:
+//   1. grabs a batch of input triangles (atomics.binCounter),
+//   2. expands them into subtriangles through a shared-memory ring buffer,
+//   3. rasterizes each subtriangle's AABB / edge functions against the
+//      screen-space bin grid, collecting per-warp hit masks in shared memory,
+//   4. appends the hits to per-bin segment linked lists in global memory
+//      (binSegData / binSegNext / binSegCount) for the coarse stage.
+//
+// NOTE(review): the emission test "outMask & (1 << threadIdx.x)" and the
+// three lines following it had been destroyed by tag-stripping corruption
+// ("(1< 0)") in the imported copy, leaving an unbalanced brace; they are
+// restored below to match the surrounding brace structure and algorithm.
+__device__ __inline__ void binRasterImpl(const CRParams p)
+{
+    __shared__ volatile U32 s_broadcast [CR_BIN_WARPS + 16];
+    __shared__ volatile S32 s_outOfs    [CR_MAXBINS_SQR];
+    __shared__ volatile S32 s_outTotal  [CR_MAXBINS_SQR];
+    __shared__ volatile S32 s_overIndex [CR_MAXBINS_SQR];
+    __shared__ volatile S32 s_outMask   [CR_BIN_WARPS][CR_MAXBINS_SQR + 1];    // +1 to avoid bank collisions
+    __shared__ volatile S32 s_outCount  [CR_BIN_WARPS][CR_MAXBINS_SQR + 1];    // +1 to avoid bank collisions
+    __shared__ volatile S32 s_triBuf    [CR_BIN_WARPS*32*4];                   // triangle ring buffer
+    __shared__ volatile U32 s_batchPos;
+    __shared__ volatile U32 s_bufCount;
+    __shared__ volatile U32 s_overTotal;
+    __shared__ volatile U32 s_allocBase;
+
+    const CRImageParams&    ip          = getImageParams(p, blockIdx.z);
+    CRAtomics&              atomics     = p.atomics[blockIdx.z];
+    const U8*               triSubtris  = (const U8*)p.triSubtris + p.maxSubtris * blockIdx.z;
+    const CRTriangleHeader* triHeader   = (const CRTriangleHeader*)p.triHeader + p.maxSubtris * blockIdx.z;
+
+    S32* binFirstSeg = (S32*)p.binFirstSeg + CR_MAXBINS_SQR * CR_BIN_STREAMS_SIZE * blockIdx.z;
+    S32* binTotal    = (S32*)p.binTotal    + CR_MAXBINS_SQR * CR_BIN_STREAMS_SIZE * blockIdx.z;
+    S32* binSegData  = (S32*)p.binSegData  + p.maxBinSegs * CR_BIN_SEG_SIZE * blockIdx.z;
+    S32* binSegNext  = (S32*)p.binSegNext  + p.maxBinSegs * blockIdx.z;
+    S32* binSegCount = (S32*)p.binSegCount + p.maxBinSegs * blockIdx.z;
+
+    // earlier stage overflowed => nothing sensible to do
+    if (atomics.numSubtris > p.maxSubtris)
+        return;
+
+    // per-thread state
+    int thrInBlock = threadIdx.x + threadIdx.y * 32;
+    int batchPos = 0;
+
+    // first 16 elements of s_broadcast are always zero
+    if (thrInBlock < 16)
+        s_broadcast[thrInBlock] = 0;
+
+    // initialize output linked lists and offsets
+    if (thrInBlock < p.numBins)
+    {
+        binFirstSeg[(thrInBlock << CR_BIN_STREAMS_LOG2) + blockIdx.x] = -1;
+        s_outOfs[thrInBlock] = -CR_BIN_SEG_SIZE;
+        s_outTotal[thrInBlock] = 0;
+    }
+
+    // repeat until done
+    for(;;)
+    {
+        // get batch
+        if (thrInBlock == 0)
+            s_batchPos = atomicAdd(&atomics.binCounter, ip.binBatchSize);
+        __syncthreads();
+        batchPos = s_batchPos;
+
+        // all batches done?
+        if (batchPos >= ip.triCount)
+            break;
+
+        // per-thread state
+        int bufIndex = 0;
+        int bufCount = 0;
+        int batchEnd = min(batchPos + ip.binBatchSize, ip.triCount);
+
+        // loop over batch as long as we have triangles in it
+        do
+        {
+            // read more triangles
+            while (bufCount < CR_BIN_WARPS*32 && batchPos < batchEnd)
+            {
+                // get subtriangle count
+
+                int triIdx = batchPos + thrInBlock;
+                int num = 0;
+                if (triIdx < batchEnd)
+                    num = triSubtris[triIdx];
+
+                // cumulative sum of subtriangles within each warp
+                U32 myIdx = __popc(__ballot_sync(~0u, num & 1) & getLaneMaskLt());
+                if (__any_sync(~0u, num > 1))
+                {
+                    myIdx += __popc(__ballot_sync(~0u, num & 2) & getLaneMaskLt()) * 2;
+                    myIdx += __popc(__ballot_sync(~0u, num & 4) & getLaneMaskLt()) * 4;
+                }
+                if (threadIdx.x == 31) // Do not assume that last thread in warp wins the write.
+                    s_broadcast[threadIdx.y + 16] = myIdx + num;
+                __syncthreads();
+
+                // cumulative sum of per-warp subtriangle counts
+                // Note: cannot have more than 32 warps or this needs to sync between each step.
+                bool act = (thrInBlock < CR_BIN_WARPS);
+                U32 actMask = __ballot_sync(~0u, act);
+                if (threadIdx.y == 0 && act)
+                {
+                    volatile U32* ptr = &s_broadcast[thrInBlock + 16];
+                    U32 val = *ptr;
+                    #if (CR_BIN_WARPS > 1)
+                        val += ptr[-1]; __syncwarp(actMask);
+                        *ptr = val; __syncwarp(actMask);
+                    #endif
+                    #if (CR_BIN_WARPS > 2)
+                        val += ptr[-2]; __syncwarp(actMask);
+                        *ptr = val; __syncwarp(actMask);
+                    #endif
+                    #if (CR_BIN_WARPS > 4)
+                        val += ptr[-4]; __syncwarp(actMask);
+                        *ptr = val; __syncwarp(actMask);
+                    #endif
+                    #if (CR_BIN_WARPS > 8)
+                        val += ptr[-8]; __syncwarp(actMask);
+                        *ptr = val; __syncwarp(actMask);
+                    #endif
+                    #if (CR_BIN_WARPS > 16)
+                        val += ptr[-16]; __syncwarp(actMask);
+                        *ptr = val; __syncwarp(actMask);
+                    #endif
+
+                    // initially assume that we consume everything
+                    // only last active thread does the writes
+                    if (threadIdx.x == CR_BIN_WARPS - 1)
+                    {
+                        s_batchPos = batchPos + CR_BIN_WARPS * 32;
+                        s_bufCount = bufCount + val;
+                    }
+                }
+                __syncthreads();
+
+                // skip if no subtriangles
+                if (num)
+                {
+                    // calculate write position for first subtriangle
+                    U32 pos = bufCount + myIdx + s_broadcast[threadIdx.y + 16 - 1];
+
+                    // only write if entire triangle fits
+                    if (pos + num <= CR_ARRAY_SIZE(s_triBuf))
+                    {
+                        pos += bufIndex; // adjust for current start position
+                        pos &= CR_ARRAY_SIZE(s_triBuf)-1;
+                        if (num == 1)
+                            s_triBuf[pos] = triIdx * 8 + 7; // single triangle
+                        else
+                        {
+                            for (int i=0; i < num; i++)
+                            {
+                                s_triBuf[pos] = triIdx * 8 + i;
+                                pos++;
+                                pos &= CR_ARRAY_SIZE(s_triBuf)-1;
+                            }
+                        }
+                    } else if (pos <= CR_ARRAY_SIZE(s_triBuf))
+                    {
+                        // this triangle is the first that failed, overwrite total count and triangle count
+                        s_batchPos = batchPos + thrInBlock;
+                        s_bufCount = pos;
+                    }
+                }
+
+                // update triangle counts
+                __syncthreads();
+                batchPos = s_batchPos;
+                bufCount = s_bufCount;
+            }
+
+            // make every warp clear its output buffers
+            for (int i=threadIdx.x; i < p.numBins; i += 32)
+                s_outMask[threadIdx.y][i] = 0;
+            __syncwarp();
+
+            // choose our triangle
+            uint4 triData = make_uint4(0, 0, 0, 0);
+            if (thrInBlock < bufCount)
+            {
+                U32 triPos = bufIndex + thrInBlock;
+                triPos &= CR_ARRAY_SIZE(s_triBuf)-1;
+
+                // find triangle
+                int triIdx = s_triBuf[triPos];
+                int dataIdx = triIdx >> 3;
+                int subtriIdx = triIdx & 7;
+                if (subtriIdx != 7)
+                    dataIdx = triHeader[dataIdx].misc + subtriIdx;
+
+                // read triangle
+
+                triData = *(((const uint4*)triHeader) + dataIdx);
+            }
+
+            // setup bounding box and edge functions, and rasterize
+            S32 lox, loy, hix, hiy;
+            bool hasTri = (thrInBlock < bufCount);
+            U32 hasTriMask = __ballot_sync(~0u, hasTri);
+            if (hasTri)
+            {
+                S32 v0x = add_s16lo_s16lo(triData.x, p.widthPixelsVp * (CR_SUBPIXEL_SIZE >> 1));
+                S32 v0y = add_s16hi_s16lo(triData.x, p.heightPixelsVp * (CR_SUBPIXEL_SIZE >> 1));
+                S32 d01x = sub_s16lo_s16lo(triData.y, triData.x);
+                S32 d01y = sub_s16hi_s16hi(triData.y, triData.x);
+                S32 d02x = sub_s16lo_s16lo(triData.z, triData.x);
+                S32 d02y = sub_s16hi_s16hi(triData.z, triData.x);
+                int binLog = CR_BIN_LOG2 + CR_TILE_LOG2 + CR_SUBPIXEL_LOG2;
+                lox = add_clamp_0_x((v0x + min_min(d01x, 0, d02x)) >> binLog, 0, p.widthBins - 1);
+                loy = add_clamp_0_x((v0y + min_min(d01y, 0, d02y)) >> binLog, 0, p.heightBins - 1);
+                hix = add_clamp_0_x((v0x + max_max(d01x, 0, d02x)) >> binLog, 0, p.widthBins - 1);
+                hiy = add_clamp_0_x((v0y + max_max(d01y, 0, d02y)) >> binLog, 0, p.heightBins - 1);
+
+                U32 bit = 1 << threadIdx.x;
+#if __CUDA_ARCH__ >= 700
+                bool multi = (hix != lox || hiy != loy);
+                if (!__any_sync(hasTriMask, multi))
+                {
+                    int binIdx = lox + p.widthBins * loy;
+                    U32 mask = __match_any_sync(hasTriMask, binIdx);
+                    s_outMask[threadIdx.y][binIdx] = mask;
+                    __syncwarp(hasTriMask);
+                } else
+#endif
+                {
+                    bool complex = (hix > lox+1 || hiy > loy+1);
+                    if (!__any_sync(hasTriMask, complex))
+                    {
+                        int binIdx = lox + p.widthBins * loy;
+                        atomicOr((U32*)&s_outMask[threadIdx.y][binIdx], bit);
+                        if (hix > lox)              atomicOr((U32*)&s_outMask[threadIdx.y][binIdx + 1], bit);
+                        if (hiy > loy)              atomicOr((U32*)&s_outMask[threadIdx.y][binIdx + p.widthBins], bit);
+                        if (hix > lox && hiy > loy) atomicOr((U32*)&s_outMask[threadIdx.y][binIdx + p.widthBins + 1], bit);
+                    } else
+                    {
+                        S32 d12x = d02x - d01x, d12y = d02y - d01y;
+                        v0x -= lox << binLog, v0y -= loy << binLog;
+
+                        S32 t01 = v0x * d01y - v0y * d01x;
+                        S32 t02 = v0y * d02x - v0x * d02y;
+                        S32 t12 = d01x * d12y - d01y * d12x - t01 - t02;
+                        S32 b01 = add_sub(t01 >> binLog, max(d01x, 0), min(d01y, 0));
+                        S32 b02 = add_sub(t02 >> binLog, max(d02y, 0), min(d02x, 0));
+                        S32 b12 = add_sub(t12 >> binLog, max(d12x, 0), min(d12y, 0));
+
+                        int width = hix - lox + 1;
+                        d01x += width * d01y;
+                        d02x += width * d02y;
+                        d12x += width * d12y;
+
+                        U8* currPtr = (U8*)&s_outMask[threadIdx.y][lox + loy * p.widthBins];
+                        U8* skipPtr = (U8*)&s_outMask[threadIdx.y][(hix + 1) + loy * p.widthBins];
+                        U8* endPtr  = (U8*)&s_outMask[threadIdx.y][lox + (hiy + 1) * p.widthBins];
+                        int stride  = p.widthBins * 4;
+                        int ptrYInc = stride - width * 4;
+
+                        do
+                        {
+                            if (b01 >= 0 && b02 >= 0 && b12 >= 0)
+                                atomicOr((U32*)currPtr, bit);
+                            currPtr += 4, b01 -= d01y, b02 += d02y, b12 -= d12y;
+                            if (currPtr == skipPtr)
+                                currPtr += ptrYInc, b01 += d01x, b02 -= d02x, b12 += d12x, skipPtr += stride;
+                        }
+                        while (currPtr != endPtr);
+                    }
+                }
+            }
+
+            // count per-bin contributions
+            if (thrInBlock == 0)
+                s_overTotal = 0; // overflow counter
+
+            // ensure that out masks are done
+            __syncthreads();
+
+            int overIndex = -1;
+            bool act = (thrInBlock < p.numBins);
+            U32 actMask = __ballot_sync(~0u, act);
+            if (act)
+            {
+                U8* srcPtr = (U8*)&s_outMask[0][thrInBlock];
+                U8* dstPtr = (U8*)&s_outCount[0][thrInBlock];
+                int total = 0;
+                for (int i = 0; i < CR_BIN_WARPS; i++)
+                {
+                    total += __popc(*(U32*)srcPtr);
+                    *(U32*)dstPtr = total;
+                    srcPtr += (CR_MAXBINS_SQR + 1) * 4;
+                    dstPtr += (CR_MAXBINS_SQR + 1) * 4;
+                }
+
+                // overflow => request a new segment
+                int ofs = s_outOfs[thrInBlock];
+                bool ovr = (((ofs - 1) >> CR_BIN_SEG_LOG2) != (((ofs - 1) + total) >> CR_BIN_SEG_LOG2));
+                U32 ovrMask = __ballot_sync(actMask, ovr);
+                if (ovr)
+                {
+                    overIndex = __popc(ovrMask & getLaneMaskLt());
+                    if (overIndex == 0)
+                        s_broadcast[threadIdx.y + 16] = atomicAdd((U32*)&s_overTotal, __popc(ovrMask));
+                    __syncwarp(ovrMask);
+                    overIndex += s_broadcast[threadIdx.y + 16];
+                    s_overIndex[thrInBlock] = overIndex;
+                }
+            }
+
+            // sync after overTotal is ready
+            __syncthreads();
+
+            // at least one segment overflowed => allocate segments
+            U32 overTotal = s_overTotal;
+            U32 allocBase = 0;
+            if (overTotal > 0)
+            {
+                // allocate memory
+                if (thrInBlock == 0)
+                {
+                    // shadows the outer allocBase on purpose; value is published via s_allocBase
+                    U32 allocBase = atomicAdd(&atomics.numBinSegs, overTotal);
+                    s_allocBase = (allocBase + overTotal <= p.maxBinSegs) ? allocBase : 0;
+                }
+                __syncthreads();
+                allocBase = s_allocBase;
+
+                // did my bin overflow?
+                if (overIndex != -1)
+                {
+                    // calculate new segment index
+                    int segIdx = allocBase + overIndex;
+
+                    // add to linked list
+                    if (s_outOfs[thrInBlock] < 0)
+                        binFirstSeg[(thrInBlock << CR_BIN_STREAMS_LOG2) + blockIdx.x] = segIdx;
+                    else
+                        binSegNext[(s_outOfs[thrInBlock] - 1) >> CR_BIN_SEG_LOG2] = segIdx;
+
+                    // defaults
+                    binSegNext [segIdx] = -1;
+                    binSegCount[segIdx] = CR_BIN_SEG_SIZE;
+                }
+            }
+
+            // concurrent emission -- each warp handles its own triangle
+            if (thrInBlock < bufCount)
+            {
+                int triPos = (bufIndex + thrInBlock) & (CR_ARRAY_SIZE(s_triBuf) - 1);
+                int currBin = lox + loy * p.widthBins;
+                int skipBin = (hix + 1) + loy * p.widthBins;
+                int endBin  = lox + (hiy + 1) * p.widthBins;
+                int binYInc = p.widthBins - (hix - lox + 1);
+
+                // loop over triangle's bins
+                do
+                {
+                    U32 outMask = s_outMask[threadIdx.y][currBin];
+                    if (outMask & (1 << threadIdx.x)) // restored (see header note)
+                    {
+                        int idx = __popc(outMask & getLaneMaskLt());
+                        if (threadIdx.y > 0)
+                            idx += s_outCount[threadIdx.y-1][currBin];
+
+                        int base = s_outOfs[currBin];
+                        int free = (-base) & (CR_BIN_SEG_SIZE - 1);
+                        if (idx >= free)
+                            idx += ((allocBase + s_overIndex[currBin]) << CR_BIN_SEG_LOG2) - free;
+                        else
+                            idx += base;
+
+                        binSegData[idx] = s_triBuf[triPos];
+                    }
+
+                    currBin++;
+                    if (currBin == skipBin)
+                        currBin += binYInc, skipBin += p.widthBins;
+                }
+                while (currBin != endBin);
+            }
+
+            // wait all triangles to finish, then replace overflown segment offsets
+            __syncthreads();
+            if (thrInBlock < p.numBins)
+            {
+                U32 total  = s_outCount[CR_BIN_WARPS - 1][thrInBlock];
+                U32 oldOfs = s_outOfs[thrInBlock];
+                if (overIndex == -1)
+                    s_outOfs[thrInBlock] = oldOfs + total;
+                else
+                {
+                    int addr = oldOfs + total;
+                    addr = ((addr - 1) & (CR_BIN_SEG_SIZE - 1)) + 1;
+                    addr += (allocBase + overIndex) << CR_BIN_SEG_LOG2;
+                    s_outOfs[thrInBlock] = addr;
+                }
+                s_outTotal[thrInBlock] += total;
+            }
+
+            // these triangles are now done
+            int count = ::min(bufCount, CR_BIN_WARPS * 32);
+            bufCount -= count;
+            bufIndex += count;
+            bufIndex &= CR_ARRAY_SIZE(s_triBuf)-1;
+        }
+        while (bufCount > 0 || batchPos < batchEnd);
+
+        // flush all bins
+        if (thrInBlock < p.numBins)
+        {
+            int ofs = s_outOfs[thrInBlock];
+            if (ofs & (CR_BIN_SEG_SIZE-1))
+            {
+                int seg = ofs >> CR_BIN_SEG_LOG2;
+                binSegCount[seg] = ofs & (CR_BIN_SEG_SIZE-1);
+                s_outOfs[thrInBlock] = (ofs + CR_BIN_SEG_SIZE - 1) & -CR_BIN_SEG_SIZE;
+            }
+        }
+    }
+
+    // output totals
+    if (thrInBlock < p.numBins)
+        binTotal[(thrInBlock << CR_BIN_STREAMS_LOG2) + blockIdx.x] = s_outTotal[thrInBlock];
+}
+
+//------------------------------------------------------------------------
diff --git a/LAM_gpro/external/nvdiffrast/nvdiffrast/common/cudaraster/impl/Buffer.cpp b/LAM_gpro/external/nvdiffrast/nvdiffrast/common/cudaraster/impl/Buffer.cpp
new file mode 100644
index 0000000..b2cd7b9
--- /dev/null
+++ b/LAM_gpro/external/nvdiffrast/nvdiffrast/common/cudaraster/impl/Buffer.cpp
@@ -0,0 +1,94 @@
+// Copyright (c) 2009-2022, NVIDIA CORPORATION. All rights reserved.
+//
+// NVIDIA CORPORATION and its licensors retain all intellectual property
+// and proprietary rights in and to this software, related documentation
+// and any modifications thereto. Any use, reproduction, disclosure or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA CORPORATION is strictly prohibited.
+
+#include "../../framework.h"
+#include "Buffer.hpp"
+
+using namespace CR;
+
+//------------------------------------------------------------------------
+// GPU buffer.
+//------------------------------------------------------------------------
+
+// Creates an empty buffer; no device memory is allocated yet.
+Buffer::Buffer(void)
+    : m_gpuPtr(NULL)
+    , m_bytes (0)
+{
+}
+
+// Releases the device allocation, if any. The cudaFree result is
+// deliberately ignored so the destructor never throws.
+Buffer::~Buffer(void)
+{
+    if (m_gpuPtr != NULL)
+        cudaFree(m_gpuPtr);
+}
+
+// Resizes the buffer to exactly 'bytes'. A matching size is a no-op;
+// passing 0 simply frees the allocation. Old contents are not preserved.
+void Buffer::reset(size_t bytes)
+{
+    if (m_bytes == bytes)
+        return; // size unchanged => keep the current allocation
+
+    if (m_gpuPtr != NULL)
+    {
+        NVDR_CHECK_CUDA_ERROR(cudaFree(m_gpuPtr));
+        m_gpuPtr = NULL;
+    }
+
+    if (bytes != 0)
+        NVDR_CHECK_CUDA_ERROR(cudaMalloc(&m_gpuPtr, bytes));
+
+    m_bytes = bytes;
+}
+
+// Ensures capacity of at least 'bytes'; never shrinks. Reallocation via
+// reset() discards previous contents (free + malloc, no copy).
+void Buffer::grow(size_t bytes)
+{
+    if (m_bytes < bytes)
+        reset(bytes);
+}
+
+//------------------------------------------------------------------------
+// Host buffer with page-locked memory.
+//------------------------------------------------------------------------
+
+// Creates an empty host buffer; no page-locked memory is allocated yet.
+HostBuffer::HostBuffer(void)
+    : m_hostPtr(NULL)
+    , m_bytes  (0)
+{
+}
+
+// Releases the page-locked allocation, if any. The cudaFreeHost result
+// is deliberately ignored so the destructor never throws.
+HostBuffer::~HostBuffer(void)
+{
+    if (m_hostPtr != NULL)
+        cudaFreeHost(m_hostPtr);
+}
+
+// Resizes the page-locked host buffer to exactly 'bytes'. A matching size
+// is a no-op; passing 0 simply frees the allocation. Contents are not kept.
+void HostBuffer::reset(size_t bytes)
+{
+    if (m_bytes == bytes)
+        return; // size unchanged => keep the current allocation
+
+    if (m_hostPtr != NULL)
+    {
+        NVDR_CHECK_CUDA_ERROR(cudaFreeHost(m_hostPtr));
+        m_hostPtr = NULL;
+    }
+
+    if (bytes != 0)
+        NVDR_CHECK_CUDA_ERROR(cudaMallocHost(&m_hostPtr, bytes));
+
+    m_bytes = bytes;
+}
+
+// Ensures capacity of at least 'bytes'; never shrinks. Reallocation via
+// reset() discards previous contents.
+void HostBuffer::grow(size_t bytes)
+{
+    if (m_bytes < bytes)
+        reset(bytes);
+}
+
+//------------------------------------------------------------------------
diff --git a/LAM_gpro/external/nvdiffrast/nvdiffrast/common/cudaraster/impl/Buffer.hpp b/LAM_gpro/external/nvdiffrast/nvdiffrast/common/cudaraster/impl/Buffer.hpp
new file mode 100644
index 0000000..8a4b38f
--- /dev/null
+++ b/LAM_gpro/external/nvdiffrast/nvdiffrast/common/cudaraster/impl/Buffer.hpp
@@ -0,0 +1,55 @@
+// Copyright (c) 2009-2022, NVIDIA CORPORATION. All rights reserved.
+//
+// NVIDIA CORPORATION and its licensors retain all intellectual property
+// and proprietary rights in and to this software, related documentation
+// and any modifications thereto. Any use, reproduction, disclosure or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA CORPORATION is strictly prohibited.
+
+#pragma once
+#include "Defs.hpp"
+
+namespace CR
+{
+//------------------------------------------------------------------------
+
+// Minimal RAII wrapper around a single cudaMalloc'd device allocation.
+// Non-copyable by convention of use; freed in the destructor.
+class Buffer
+{
+public:
+    // Constructs empty (no allocation); call reset()/grow() before use.
+    Buffer          (void);
+    ~Buffer         (void);
+
+    // Reallocates to exactly 'bytes' (no-op if unchanged; 0 frees).
+    void    reset       (size_t bytes);
+    // Enlarges to at least 'bytes'; never shrinks.
+    void    grow        (size_t bytes);
+    // Device pointer advanced by 'offset' bytes.
+    void*   getPtr      (size_t offset = 0) { return (void*)(((uintptr_t)m_gpuPtr) + offset); }
+    size_t  getSize     (void) const        { return m_bytes; }
+
+    // NOTE(review): overwrites the pointer without freeing the previous
+    // allocation -- caller takes over ownership bookkeeping.
+    void    setPtr      (void* ptr)         { m_gpuPtr = ptr; }
+
+private:
+    void*   m_gpuPtr;   // device pointer (cudaMalloc), NULL when empty
+    size_t  m_bytes;    // current allocation size in bytes
+};
+
+//------------------------------------------------------------------------
+
+// Minimal RAII wrapper around a page-locked (cudaMallocHost) host
+// allocation; freed in the destructor.
+class HostBuffer
+{
+public:
+    // Constructs empty (no allocation); call reset()/grow() before use.
+    HostBuffer      (void);
+    ~HostBuffer     (void);
+
+    // Reallocates to exactly 'bytes' (no-op if unchanged; 0 frees).
+    void    reset       (size_t bytes);
+    // Enlarges to at least 'bytes'; never shrinks.
+    void    grow        (size_t bytes);
+    void*   getPtr      (void)              { return m_hostPtr; }
+    size_t  getSize     (void) const        { return m_bytes; }
+
+    // NOTE(review): overwrites the pointer without freeing the previous
+    // allocation -- caller takes over ownership bookkeeping.
+    void    setPtr      (void* ptr)         { m_hostPtr = ptr; }
+
+private:
+    void*   m_hostPtr;  // page-locked host pointer, NULL when empty
+    size_t  m_bytes;    // current allocation size in bytes
+};
+
+//------------------------------------------------------------------------
+}
diff --git a/LAM_gpro/external/nvdiffrast/nvdiffrast/common/cudaraster/impl/CoarseRaster.inl b/LAM_gpro/external/nvdiffrast/nvdiffrast/common/cudaraster/impl/CoarseRaster.inl
new file mode 100644
index 0000000..a7081c7
--- /dev/null
+++ b/LAM_gpro/external/nvdiffrast/nvdiffrast/common/cudaraster/impl/CoarseRaster.inl
@@ -0,0 +1,730 @@
+// Copyright (c) 2009-2022, NVIDIA CORPORATION. All rights reserved.
+//
+// NVIDIA CORPORATION and its licensors retain all intellectual property
+// and proprietary rights in and to this software, related documentation
+// and any modifications thereto. Any use, reproduction, disclosure or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA CORPORATION is strictly prohibited.
+
+//------------------------------------------------------------------------
+
+// Converts a bin-local tile index into a global tile index: splits it into
+// (x, y) coordinates inside the CR_BIN_SIZE x CR_BIN_SIZE bin, then
+// flattens against the full tile row width of the frame.
+__device__ __inline__ int globalTileIdx(int tileInBin, int widthTiles)
+{
+    int y = tileInBin >> CR_BIN_LOG2;
+    int x = tileInBin & (CR_BIN_SIZE - 1);
+    return y * widthTiles + x;
+}
+
+//------------------------------------------------------------------------
+
+__device__ __inline__ void coarseRasterImpl(const CRParams p)
+{
+ // Common.
+
+ __shared__ volatile U32 s_workCounter;
+ __shared__ volatile U32 s_scanTemp [CR_COARSE_WARPS][48]; // 3KB
+
+ // Input.
+
+ __shared__ volatile U32 s_binOrder [CR_MAXBINS_SQR]; // 1KB
+ __shared__ volatile S32 s_binStreamCurrSeg [CR_BIN_STREAMS_SIZE]; // 0KB
+ __shared__ volatile S32 s_binStreamFirstTri [CR_BIN_STREAMS_SIZE]; // 0KB
+ __shared__ volatile S32 s_triQueue [CR_COARSE_QUEUE_SIZE]; // 4KB
+ __shared__ volatile S32 s_triQueueWritePos;
+ __shared__ volatile U32 s_binStreamSelectedOfs;
+ __shared__ volatile U32 s_binStreamSelectedSize;
+
+ // Output.
+
+ __shared__ volatile U32 s_warpEmitMask [CR_COARSE_WARPS][CR_BIN_SQR + 1]; // 16KB, +1 to avoid bank collisions
+ __shared__ volatile U32 s_warpEmitPrefixSum [CR_COARSE_WARPS][CR_BIN_SQR + 1]; // 16KB, +1 to avoid bank collisions
+ __shared__ volatile U32 s_tileEmitPrefixSum [CR_BIN_SQR + 1]; // 1KB, zero at the beginning
+ __shared__ volatile U32 s_tileAllocPrefixSum[CR_BIN_SQR + 1]; // 1KB, zero at the beginning
+ __shared__ volatile S32 s_tileStreamCurrOfs [CR_BIN_SQR]; // 1KB
+ __shared__ volatile U32 s_firstAllocSeg;
+ __shared__ volatile U32 s_firstActiveIdx;
+
+ // Pointers and constants.
+
+ CRAtomics& atomics = p.atomics[blockIdx.z];
+ const CRTriangleHeader* triHeader = (const CRTriangleHeader*)p.triHeader + p.maxSubtris * blockIdx.z;
+ const S32* binFirstSeg = (const S32*)p.binFirstSeg + CR_MAXBINS_SQR * CR_BIN_STREAMS_SIZE * blockIdx.z;
+ const S32* binTotal = (const S32*)p.binTotal + CR_MAXBINS_SQR * CR_BIN_STREAMS_SIZE * blockIdx.z;
+ const S32* binSegData = (const S32*)p.binSegData + p.maxBinSegs * CR_BIN_SEG_SIZE * blockIdx.z;
+ const S32* binSegNext = (const S32*)p.binSegNext + p.maxBinSegs * blockIdx.z;
+ const S32* binSegCount = (const S32*)p.binSegCount + p.maxBinSegs * blockIdx.z;
+ S32* activeTiles = (S32*)p.activeTiles + CR_MAXTILES_SQR * blockIdx.z;
+ S32* tileFirstSeg = (S32*)p.tileFirstSeg + CR_MAXTILES_SQR * blockIdx.z;
+ S32* tileSegData = (S32*)p.tileSegData + p.maxTileSegs * CR_TILE_SEG_SIZE * blockIdx.z;
+ S32* tileSegNext = (S32*)p.tileSegNext + p.maxTileSegs * blockIdx.z;
+ S32* tileSegCount = (S32*)p.tileSegCount + p.maxTileSegs * blockIdx.z;
+
+ int tileLog = CR_TILE_LOG2 + CR_SUBPIXEL_LOG2;
+ int thrInBlock = threadIdx.x + threadIdx.y * 32;
+ int emitShift = CR_BIN_LOG2 * 2 + 5; // We scan ((numEmits << emitShift) | numAllocs) over tiles.
+
+ if (atomics.numSubtris > p.maxSubtris || atomics.numBinSegs > p.maxBinSegs)
+ return;
+
+ // Initialize sharedmem arrays.
+
+ if (thrInBlock == 0)
+ {
+ s_tileEmitPrefixSum[0] = 0;
+ s_tileAllocPrefixSum[0] = 0;
+ }
+ s_scanTemp[threadIdx.y][threadIdx.x] = 0;
+
+ // Sort bins in descending order of triangle count.
+
+ for (int binIdx = thrInBlock; binIdx < p.numBins; binIdx += CR_COARSE_WARPS * 32)
+ {
+ int count = 0;
+ for (int i = 0; i < CR_BIN_STREAMS_SIZE; i++)
+ count += binTotal[(binIdx << CR_BIN_STREAMS_LOG2) + i];
+ s_binOrder[binIdx] = (~count << (CR_MAXBINS_LOG2 * 2)) | binIdx;
+ }
+
+ __syncthreads();
+ sortShared(s_binOrder, p.numBins);
+
+ // Process each bin by one block.
+
+ for (;;)
+ {
+ // Pick a bin for the block.
+
+ if (thrInBlock == 0)
+ s_workCounter = atomicAdd(&atomics.coarseCounter, 1);
+ __syncthreads();
+
+ int workCounter = s_workCounter;
+ if (workCounter >= p.numBins)
+ break;
+
+ U32 binOrder = s_binOrder[workCounter];
+ bool binEmpty = ((~binOrder >> (CR_MAXBINS_LOG2 * 2)) == 0);
+ if (binEmpty && !p.deferredClear)
+ break;
+
+ int binIdx = binOrder & (CR_MAXBINS_SQR - 1);
+
+ // Initialize input/output streams.
+
+ int triQueueWritePos = 0;
+ int triQueueReadPos = 0;
+
+ if (thrInBlock < CR_BIN_STREAMS_SIZE)
+ {
+ int segIdx = binFirstSeg[(binIdx << CR_BIN_STREAMS_LOG2) + thrInBlock];
+ s_binStreamCurrSeg[thrInBlock] = segIdx;
+ s_binStreamFirstTri[thrInBlock] = (segIdx == -1) ? ~0u : binSegData[segIdx << CR_BIN_SEG_LOG2];
+ }
+
+ for (int tileInBin = CR_COARSE_WARPS * 32 - 1 - thrInBlock; tileInBin < CR_BIN_SQR; tileInBin += CR_COARSE_WARPS * 32)
+ s_tileStreamCurrOfs[tileInBin] = -CR_TILE_SEG_SIZE;
+
+ // Initialize per-bin state.
+
+ int binY = idiv_fast(binIdx, p.widthBins);
+ int binX = binIdx - binY * p.widthBins;
+ int originX = (binX << (CR_BIN_LOG2 + tileLog)) - (p.widthPixelsVp << (CR_SUBPIXEL_LOG2 - 1));
+ int originY = (binY << (CR_BIN_LOG2 + tileLog)) - (p.heightPixelsVp << (CR_SUBPIXEL_LOG2 - 1));
+ int maxTileXInBin = ::min(p.widthTiles - (binX << CR_BIN_LOG2), CR_BIN_SIZE) - 1;
+ int maxTileYInBin = ::min(p.heightTiles - (binY << CR_BIN_LOG2), CR_BIN_SIZE) - 1;
+ int binTileIdx = (binX + binY * p.widthTiles) << CR_BIN_LOG2;
+
+ // Entire block: Merge input streams and process triangles.
+
+ if (!binEmpty)
+ do
+ {
+ //------------------------------------------------------------------------
+ // Merge.
+ //------------------------------------------------------------------------
+
+ // Entire block: Not enough triangles => merge and queue segments.
+ // NOTE: The bin exit criterion assumes that we queue more triangles than we actually need.
+
+ while (triQueueWritePos - triQueueReadPos <= CR_COARSE_WARPS * 32)
+ {
+ // First warp: Choose the segment with the lowest initial triangle index.
+
+ bool hasStream = (thrInBlock < CR_BIN_STREAMS_SIZE);
+ U32 hasStreamMask = __ballot_sync(~0u, hasStream);
+ if (hasStream)
+ {
+ // Find the stream with the lowest triangle index.
+
+ U32 firstTri = s_binStreamFirstTri[thrInBlock];
+ U32 t = firstTri;
+ volatile U32* v = &s_scanTemp[0][thrInBlock + 16];
+
+ #if (CR_BIN_STREAMS_SIZE > 1)
+ v[0] = t; __syncwarp(hasStreamMask); t = ::min(t, v[-1]); __syncwarp(hasStreamMask);
+ #endif
+ #if (CR_BIN_STREAMS_SIZE > 2)
+ v[0] = t; __syncwarp(hasStreamMask); t = ::min(t, v[-2]); __syncwarp(hasStreamMask);
+ #endif
+ #if (CR_BIN_STREAMS_SIZE > 4)
+ v[0] = t; __syncwarp(hasStreamMask); t = ::min(t, v[-4]); __syncwarp(hasStreamMask);
+ #endif
+ #if (CR_BIN_STREAMS_SIZE > 8)
+ v[0] = t; __syncwarp(hasStreamMask); t = ::min(t, v[-8]); __syncwarp(hasStreamMask);
+ #endif
+ #if (CR_BIN_STREAMS_SIZE > 16)
+ v[0] = t; __syncwarp(hasStreamMask); t = ::min(t, v[-16]); __syncwarp(hasStreamMask);
+ #endif
+ v[0] = t; __syncwarp(hasStreamMask);
+
+ // Consume and broadcast.
+
+ bool first = (s_scanTemp[0][CR_BIN_STREAMS_SIZE - 1 + 16] == firstTri);
+ U32 firstMask = __ballot_sync(hasStreamMask, first);
+ if (first && (firstMask >> threadIdx.x) == 1u)
+ {
+ int segIdx = s_binStreamCurrSeg[thrInBlock];
+ s_binStreamSelectedOfs = segIdx << CR_BIN_SEG_LOG2;
+ if (segIdx != -1)
+ {
+ int segSize = binSegCount[segIdx];
+ int segNext = binSegNext[segIdx];
+ s_binStreamSelectedSize = segSize;
+ s_triQueueWritePos = triQueueWritePos + segSize;
+ s_binStreamCurrSeg[thrInBlock] = segNext;
+ s_binStreamFirstTri[thrInBlock] = (segNext == -1) ? ~0u : binSegData[segNext << CR_BIN_SEG_LOG2];
+ }
+ }
+ }
+
+ // No more segments => break.
+
+ __syncthreads();
+ triQueueWritePos = s_triQueueWritePos;
+ int segOfs = s_binStreamSelectedOfs;
+ if (segOfs < 0)
+ break;
+
+ int segSize = s_binStreamSelectedSize;
+ __syncthreads();
+
+ // Fetch triangles into the queue.
+
+ for (int idxInSeg = CR_COARSE_WARPS * 32 - 1 - thrInBlock; idxInSeg < segSize; idxInSeg += CR_COARSE_WARPS * 32)
+ {
+ S32 triIdx = binSegData[segOfs + idxInSeg];
+ s_triQueue[(triQueueWritePos - segSize + idxInSeg) & (CR_COARSE_QUEUE_SIZE - 1)] = triIdx;
+ }
+ }
+
+ // All threads: Clear emit masks.
+
+ for (int maskIdx = thrInBlock; maskIdx < CR_COARSE_WARPS * CR_BIN_SQR; maskIdx += CR_COARSE_WARPS * 32)
+ s_warpEmitMask[maskIdx >> (CR_BIN_LOG2 * 2)][maskIdx & (CR_BIN_SQR - 1)] = 0;
+
+ __syncthreads();
+
+ //------------------------------------------------------------------------
+ // Raster.
+ //------------------------------------------------------------------------
+
+ // Triangle per thread: Read from the queue.
+
+ int triIdx = -1;
+ if (triQueueReadPos + thrInBlock < triQueueWritePos)
+ triIdx = s_triQueue[(triQueueReadPos + thrInBlock) & (CR_COARSE_QUEUE_SIZE - 1)];
+
+ uint4 triData = make_uint4(0, 0, 0, 0);
+ if (triIdx != -1)
+ {
+ int dataIdx = triIdx >> 3;
+ int subtriIdx = triIdx & 7;
+ if (subtriIdx != 7)
+ dataIdx = triHeader[dataIdx].misc + subtriIdx;
+ triData = *((uint4*)triHeader + dataIdx);
+ }
+
+ // 32 triangles per warp: Record emits (= tile intersections).
+
+ if (__any_sync(~0u, triIdx != -1))
+ {
+ S32 v0x = sub_s16lo_s16lo(triData.x, originX);
+ S32 v0y = sub_s16hi_s16lo(triData.x, originY);
+ S32 d01x = sub_s16lo_s16lo(triData.y, triData.x);
+ S32 d01y = sub_s16hi_s16hi(triData.y, triData.x);
+ S32 d02x = sub_s16lo_s16lo(triData.z, triData.x);
+ S32 d02y = sub_s16hi_s16hi(triData.z, triData.x);
+
+ // Compute tile-based AABB.
+
+ int lox = add_clamp_0_x((v0x + min_min(d01x, 0, d02x)) >> tileLog, 0, maxTileXInBin);
+ int loy = add_clamp_0_x((v0y + min_min(d01y, 0, d02y)) >> tileLog, 0, maxTileYInBin);
+ int hix = add_clamp_0_x((v0x + max_max(d01x, 0, d02x)) >> tileLog, 0, maxTileXInBin);
+ int hiy = add_clamp_0_x((v0y + max_max(d01y, 0, d02y)) >> tileLog, 0, maxTileYInBin);
+ int sizex = add_sub(hix, 1, lox);
+ int sizey = add_sub(hiy, 1, loy);
+ int area = sizex * sizey;
+
+ // Miscellaneous init.
+
+ U8* currPtr = (U8*)&s_warpEmitMask[threadIdx.y][lox + (loy << CR_BIN_LOG2)];
+ int ptrYInc = CR_BIN_SIZE * 4 - (sizex << 2);
+ U32 maskBit = 1 << threadIdx.x;
+
+ // Case A: All AABBs are small => record the full AABB using atomics.
+
+ if (__all_sync(~0u, sizex <= 2 && sizey <= 2))
+ {
+ if (triIdx != -1)
+ {
+ atomicOr((U32*)currPtr, maskBit);
+ if (sizex == 2) atomicOr((U32*)(currPtr + 4), maskBit);
+ if (sizey == 2) atomicOr((U32*)(currPtr + CR_BIN_SIZE * 4), maskBit);
+ if (sizex == 2 && sizey == 2) atomicOr((U32*)(currPtr + 4 + CR_BIN_SIZE * 4), maskBit);
+ }
+ }
+ else
+ {
+ // Compute warp-AABB (scan-32).
+
+ U32 aabbMask = add_sub(2 << hix, 0x20000 << hiy, 1 << lox) - (0x10000 << loy);
+ if (triIdx == -1)
+ aabbMask = 0;
+
+ volatile U32* v = &s_scanTemp[threadIdx.y][threadIdx.x + 16];
+ v[0] = aabbMask; __syncwarp(); aabbMask |= v[-1]; __syncwarp();
+ v[0] = aabbMask; __syncwarp(); aabbMask |= v[-2]; __syncwarp();
+ v[0] = aabbMask; __syncwarp(); aabbMask |= v[-4]; __syncwarp();
+ v[0] = aabbMask; __syncwarp(); aabbMask |= v[-8]; __syncwarp();
+ v[0] = aabbMask; __syncwarp(); aabbMask |= v[-16]; __syncwarp();
+ v[0] = aabbMask; __syncwarp(); aabbMask = s_scanTemp[threadIdx.y][47];
+
+ U32 maskX = aabbMask & 0xFFFF;
+ U32 maskY = aabbMask >> 16;
+ int wlox = findLeadingOne(maskX ^ (maskX - 1));
+ int wloy = findLeadingOne(maskY ^ (maskY - 1));
+ int whix = findLeadingOne(maskX);
+ int whiy = findLeadingOne(maskY);
+ int warea = (add_sub(whix, 1, wlox)) * (add_sub(whiy, 1, wloy));
+
+ // Initialize edge functions.
+
+ S32 d12x = d02x - d01x;
+ S32 d12y = d02y - d01y;
+ v0x -= lox << tileLog;
+ v0y -= loy << tileLog;
+
+ S32 t01 = v0x * d01y - v0y * d01x;
+ S32 t02 = v0y * d02x - v0x * d02y;
+ S32 t12 = d01x * d12y - d01y * d12x - t01 - t02;
+ S32 b01 = add_sub(t01 >> tileLog, ::max(d01x, 0), ::min(d01y, 0));
+ S32 b02 = add_sub(t02 >> tileLog, ::max(d02y, 0), ::min(d02x, 0));
+ S32 b12 = add_sub(t12 >> tileLog, ::max(d12x, 0), ::min(d12y, 0));
+
+ d01x += sizex * d01y;
+ d02x += sizex * d02y;
+ d12x += sizex * d12y;
+
+ // Case B: Warp-AABB is not much larger than largest AABB => Check tiles in warp-AABB, record using ballots.
+ if (__any_sync(~0u, warea * 4 <= area * 8))
+ {
+ // Not sure if this is any faster than Case C after all the post-Volta ballot mask tracking.
+ bool act = (triIdx != -1);
+ U32 actMask = __ballot_sync(~0u, act);
+ if (act)
+ {
+ for (int y = wloy; y <= whiy; y++)
+ {
+ bool yIn = (y >= loy && y <= hiy);
+ U32 yMask = __ballot_sync(actMask, yIn);
+ if (yIn)
+ {
+ for (int x = wlox; x <= whix; x++)
+ {
+ bool xyIn = (x >= lox && x <= hix);
+ U32 xyMask = __ballot_sync(yMask, xyIn);
+ if (xyIn)
+ {
+ U32 res = __ballot_sync(xyMask, b01 >= 0 && b02 >= 0 && b12 >= 0);
+ if (threadIdx.x == 31 - __clz(xyMask))
+ *(U32*)currPtr = res;
+ currPtr += 4, b01 -= d01y, b02 += d02y, b12 -= d12y;
+ }
+ }
+ currPtr += ptrYInc, b01 += d01x, b02 -= d02x, b12 += d12x;
+ }
+ }
+ }
+ }
+
+ // Case C: General case => Check tiles in AABB, record using atomics.
+
+ else
+ {
+ if (triIdx != -1)
+ {
+ U8* skipPtr = currPtr + (sizex << 2);
+ U8* endPtr = currPtr + (sizey << (CR_BIN_LOG2 + 2));
+ do
+ {
+ if (b01 >= 0 && b02 >= 0 && b12 >= 0)
+ atomicOr((U32*)currPtr, maskBit);
+ currPtr += 4, b01 -= d01y, b02 += d02y, b12 -= d12y;
+ if (currPtr == skipPtr)
+ currPtr += ptrYInc, b01 += d01x, b02 -= d02x, b12 += d12x, skipPtr += CR_BIN_SIZE * 4;
+ }
+ while (currPtr != endPtr);
+ }
+ }
+ }
+ }
+
+ __syncthreads();
+
+ //------------------------------------------------------------------------
+ // Count.
+ //------------------------------------------------------------------------
+
+ // Tile per thread: Initialize prefix sums.
+
+ for (int tileInBin_base = 0; tileInBin_base < CR_BIN_SQR; tileInBin_base += CR_COARSE_WARPS * 32)
+ {
+ int tileInBin = tileInBin_base + thrInBlock;
+ bool act = (tileInBin < CR_BIN_SQR);
+ U32 actMask = __ballot_sync(~0u, act);
+ if (act)
+ {
+ // Compute prefix sum of emits over warps.
+
+ U8* srcPtr = (U8*)&s_warpEmitMask[0][tileInBin];
+ U8* dstPtr = (U8*)&s_warpEmitPrefixSum[0][tileInBin];
+ int tileEmits = 0;
+ for (int i = 0; i < CR_COARSE_WARPS; i++)
+ {
+ tileEmits += __popc(*(U32*)srcPtr);
+ *(U32*)dstPtr = tileEmits;
+ srcPtr += (CR_BIN_SQR + 1) * 4;
+ dstPtr += (CR_BIN_SQR + 1) * 4;
+ }
+
+ // Determine the number of segments to allocate.
+
+ int spaceLeft = -s_tileStreamCurrOfs[tileInBin] & (CR_TILE_SEG_SIZE - 1);
+ int tileAllocs = (tileEmits - spaceLeft + CR_TILE_SEG_SIZE - 1) >> CR_TILE_SEG_LOG2;
+ volatile U32* v = &s_tileEmitPrefixSum[tileInBin + 1];
+
+ // All counters within the warp are small => compute prefix sum using ballot.
+
+ if (!__any_sync(actMask, tileEmits >= 2))
+ {
+ U32 m = getLaneMaskLe();
+ *v = (__popc(__ballot_sync(actMask, tileEmits & 1) & m) << emitShift) | __popc(__ballot_sync(actMask, tileAllocs & 1) & m);
+ }
+
+ // Otherwise => scan-32 within the warp.
+
+ else
+ {
+ U32 sum = (tileEmits << emitShift) | tileAllocs;
+ *v = sum; __syncwarp(actMask); if (threadIdx.x >= 1) sum += v[-1]; __syncwarp(actMask);
+ *v = sum; __syncwarp(actMask); if (threadIdx.x >= 2) sum += v[-2]; __syncwarp(actMask);
+ *v = sum; __syncwarp(actMask); if (threadIdx.x >= 4) sum += v[-4]; __syncwarp(actMask);
+ *v = sum; __syncwarp(actMask); if (threadIdx.x >= 8) sum += v[-8]; __syncwarp(actMask);
+ *v = sum; __syncwarp(actMask); if (threadIdx.x >= 16) sum += v[-16]; __syncwarp(actMask);
+ *v = sum; __syncwarp(actMask);
+ }
+ }
+ }
+
+ // First warp: Scan-8.
+
+ __syncthreads();
+
+ bool scan8 = (thrInBlock < CR_BIN_SQR / 32);
+ U32 scan8Mask = __ballot_sync(~0u, scan8);
+ if (scan8)
+ {
+ int sum = s_tileEmitPrefixSum[(thrInBlock << 5) + 32];
+ volatile U32* v = &s_scanTemp[0][thrInBlock + 16];
+ v[0] = sum; __syncwarp(scan8Mask);
+ #if (CR_BIN_SQR > 1 * 32)
+ sum += v[-1]; __syncwarp(scan8Mask); v[0] = sum; __syncwarp(scan8Mask);
+ #endif
+ #if (CR_BIN_SQR > 2 * 32)
+ sum += v[-2]; __syncwarp(scan8Mask); v[0] = sum; __syncwarp(scan8Mask);
+ #endif
+ #if (CR_BIN_SQR > 4 * 32)
+ sum += v[-4]; __syncwarp(scan8Mask); v[0] = sum; __syncwarp(scan8Mask);
+ #endif
+ }
+
+ __syncthreads();
+
+ // Tile per thread: Finalize prefix sums.
+ // Single thread: Allocate segments.
+
+ for (int tileInBin = thrInBlock; tileInBin < CR_BIN_SQR; tileInBin += CR_COARSE_WARPS * 32)
+ {
+ int sum = s_tileEmitPrefixSum[tileInBin + 1] + s_scanTemp[0][(tileInBin >> 5) + 15];
+ int numEmits = sum >> emitShift;
+ int numAllocs = sum & ((1 << emitShift) - 1);
+ s_tileEmitPrefixSum[tileInBin + 1] = numEmits;
+ s_tileAllocPrefixSum[tileInBin + 1] = numAllocs;
+
+ if (tileInBin == CR_BIN_SQR - 1 && numAllocs != 0)
+ {
+ int t = atomicAdd(&atomics.numTileSegs, numAllocs);
+ s_firstAllocSeg = (t + numAllocs <= p.maxTileSegs) ? t : 0;
+ }
+ }
+
+ __syncthreads();
+ int firstAllocSeg = s_firstAllocSeg;
+ int totalEmits = s_tileEmitPrefixSum[CR_BIN_SQR];
+ int totalAllocs = s_tileAllocPrefixSum[CR_BIN_SQR];
+
+ //------------------------------------------------------------------------
+ // Emit.
+ //------------------------------------------------------------------------
+
+ // Emit per thread: Write triangle index to globalmem.
+
+ for (int emitInBin = thrInBlock; emitInBin < totalEmits; emitInBin += CR_COARSE_WARPS * 32)
+ {
+ // Find tile in bin.
+
+ U8* tileBase = (U8*)&s_tileEmitPrefixSum[0];
+ U8* tilePtr = tileBase;
+ U8* ptr;
+
+ #if (CR_BIN_SQR > 128)
+ ptr = tilePtr + 0x80 * 4; if (emitInBin >= *(U32*)ptr) tilePtr = ptr;
+ #endif
+ #if (CR_BIN_SQR > 64)
+ ptr = tilePtr + 0x40 * 4; if (emitInBin >= *(U32*)ptr) tilePtr = ptr;
+ #endif
+ #if (CR_BIN_SQR > 32)
+ ptr = tilePtr + 0x20 * 4; if (emitInBin >= *(U32*)ptr) tilePtr = ptr;
+ #endif
+ #if (CR_BIN_SQR > 16)
+ ptr = tilePtr + 0x10 * 4; if (emitInBin >= *(U32*)ptr) tilePtr = ptr;
+ #endif
+ #if (CR_BIN_SQR > 8)
+ ptr = tilePtr + 0x08 * 4; if (emitInBin >= *(U32*)ptr) tilePtr = ptr;
+ #endif
+ #if (CR_BIN_SQR > 4)
+ ptr = tilePtr + 0x04 * 4; if (emitInBin >= *(U32*)ptr) tilePtr = ptr;
+ #endif
+ #if (CR_BIN_SQR > 2)
+ ptr = tilePtr + 0x02 * 4; if (emitInBin >= *(U32*)ptr) tilePtr = ptr;
+ #endif
+ #if (CR_BIN_SQR > 1)
+ ptr = tilePtr + 0x01 * 4; if (emitInBin >= *(U32*)ptr) tilePtr = ptr;
+ #endif
+
+ int tileInBin = (tilePtr - tileBase) >> 2;
+ int emitInTile = emitInBin - *(U32*)tilePtr;
+
+ // Find warp in tile.
+
+ int warpStep = (CR_BIN_SQR + 1) * 4;
+ U8* warpBase = (U8*)&s_warpEmitPrefixSum[0][tileInBin] - warpStep;
+ U8* warpPtr = warpBase;
+
+ #if (CR_COARSE_WARPS > 8)
+ ptr = warpPtr + 0x08 * warpStep; if (emitInTile >= *(U32*)ptr) warpPtr = ptr;
+ #endif
+ #if (CR_COARSE_WARPS > 4)
+ ptr = warpPtr + 0x04 * warpStep; if (emitInTile >= *(U32*)ptr) warpPtr = ptr;
+ #endif
+ #if (CR_COARSE_WARPS > 2)
+ ptr = warpPtr + 0x02 * warpStep; if (emitInTile >= *(U32*)ptr) warpPtr = ptr;
+ #endif
+ #if (CR_COARSE_WARPS > 1)
+ ptr = warpPtr + 0x01 * warpStep; if (emitInTile >= *(U32*)ptr) warpPtr = ptr;
+ #endif
+
+ int warpInTile = (warpPtr - warpBase) >> (CR_BIN_LOG2 * 2 + 2);
+ U32 emitMask = *(U32*)(warpPtr + warpStep + ((U8*)s_warpEmitMask - (U8*)s_warpEmitPrefixSum));
+ int emitInWarp = emitInTile - *(U32*)(warpPtr + warpStep) + __popc(emitMask);
+
+ // Find thread in warp.
+
+ int threadInWarp = 0;
+ int pop = __popc(emitMask & 0xFFFF);
+ bool pred = (emitInWarp >= pop);
+ if (pred) emitInWarp -= pop;
+ if (pred) emitMask >>= 0x10;
+ if (pred) threadInWarp += 0x10;
+
+ pop = __popc(emitMask & 0xFF);
+ pred = (emitInWarp >= pop);
+ if (pred) emitInWarp -= pop;
+ if (pred) emitMask >>= 0x08;
+ if (pred) threadInWarp += 0x08;
+
+ pop = __popc(emitMask & 0xF);
+ pred = (emitInWarp >= pop);
+ if (pred) emitInWarp -= pop;
+ if (pred) emitMask >>= 0x04;
+ if (pred) threadInWarp += 0x04;
+
+ pop = __popc(emitMask & 0x3);
+ pred = (emitInWarp >= pop);
+ if (pred) emitInWarp -= pop;
+ if (pred) emitMask >>= 0x02;
+ if (pred) threadInWarp += 0x02;
+
+ if (emitInWarp >= (emitMask & 1))
+ threadInWarp++;
+
+ // Figure out where to write.
+
+ int currOfs = s_tileStreamCurrOfs[tileInBin];
+ int spaceLeft = -currOfs & (CR_TILE_SEG_SIZE - 1);
+ int outOfs = emitInTile;
+
+ if (outOfs < spaceLeft)
+ outOfs += currOfs;
+ else
+ {
+ int allocLo = firstAllocSeg + s_tileAllocPrefixSum[tileInBin];
+ outOfs += (allocLo << CR_TILE_SEG_LOG2) - spaceLeft;
+ }
+
+ // Write.
+
+ int queueIdx = warpInTile * 32 + threadInWarp;
+ int triIdx = s_triQueue[(triQueueReadPos + queueIdx) & (CR_COARSE_QUEUE_SIZE - 1)];
+
+ tileSegData[outOfs] = triIdx;
+ }
+
+ //------------------------------------------------------------------------
+ // Patch.
+ //------------------------------------------------------------------------
+
+ // Allocated segment per thread: Initialize next-pointer and count.
+
+ for (int i = CR_COARSE_WARPS * 32 - 1 - thrInBlock; i < totalAllocs; i += CR_COARSE_WARPS * 32)
+ {
+ int segIdx = firstAllocSeg + i;
+ tileSegNext[segIdx] = segIdx + 1;
+ tileSegCount[segIdx] = CR_TILE_SEG_SIZE;
+ }
+
+ // Tile per thread: Fix previous segment's next-pointer and update s_tileStreamCurrOfs.
+
+ __syncthreads();
+ for (int tileInBin = CR_COARSE_WARPS * 32 - 1 - thrInBlock; tileInBin < CR_BIN_SQR; tileInBin += CR_COARSE_WARPS * 32)
+ {
+ int oldOfs = s_tileStreamCurrOfs[tileInBin];
+ int newOfs = oldOfs + s_warpEmitPrefixSum[CR_COARSE_WARPS - 1][tileInBin];
+ int allocLo = s_tileAllocPrefixSum[tileInBin];
+ int allocHi = s_tileAllocPrefixSum[tileInBin + 1];
+
+ if (allocLo != allocHi)
+ {
+ S32* nextPtr = &tileSegNext[(oldOfs - 1) >> CR_TILE_SEG_LOG2];
+ if (oldOfs < 0)
+ nextPtr = &tileFirstSeg[binTileIdx + globalTileIdx(tileInBin, p.widthTiles)];
+ *nextPtr = firstAllocSeg + allocLo;
+
+ newOfs--;
+ newOfs &= CR_TILE_SEG_SIZE - 1;
+ newOfs |= (firstAllocSeg + allocHi - 1) << CR_TILE_SEG_LOG2;
+ newOfs++;
+ }
+ s_tileStreamCurrOfs[tileInBin] = newOfs;
+ }
+
+ // Advance queue read pointer.
+ // Queue became empty => bin done.
+
+ triQueueReadPos += CR_COARSE_WARPS * 32;
+ }
+ while (triQueueReadPos < triQueueWritePos);
+
+ // Tile per thread: Fix next-pointer and count of the last segment.
+ // 32 tiles per warp: Count active tiles.
+
+ __syncthreads();
+
+ for (int tileInBin_base = 0; tileInBin_base < CR_BIN_SQR; tileInBin_base += CR_COARSE_WARPS * 32)
+ {
+ int tileInBin = tileInBin_base + thrInBlock;
+ bool act = (tileInBin < CR_BIN_SQR);
+ U32 actMask = __ballot_sync(~0u, act);
+ if (act)
+ {
+ int tileX = tileInBin & (CR_BIN_SIZE - 1);
+ int tileY = tileInBin >> CR_BIN_LOG2;
+ bool force = (p.deferredClear & tileX <= maxTileXInBin & tileY <= maxTileYInBin);
+
+ int ofs = s_tileStreamCurrOfs[tileInBin];
+ int segIdx = (ofs - 1) >> CR_TILE_SEG_LOG2;
+ int segCount = ofs & (CR_TILE_SEG_SIZE - 1);
+
+ if (ofs >= 0)
+ tileSegNext[segIdx] = -1;
+ else if (force)
+ {
+ s_tileStreamCurrOfs[tileInBin] = 0;
+ tileFirstSeg[binTileIdx + tileX + tileY * p.widthTiles] = -1;
+ }
+
+ if (segCount != 0)
+ tileSegCount[segIdx] = segCount;
+
+ U32 res = __ballot_sync(actMask, ofs >= 0 | force);
+ if (threadIdx.x == 0)
+ s_scanTemp[0][(tileInBin >> 5) + 16] = __popc(res);
+ }
+ }
+
+ // First warp: Scan-8.
+ // One thread: Allocate space for active tiles.
+
+ __syncthreads();
+
+ bool scan8 = (thrInBlock < CR_BIN_SQR / 32);
+ U32 scan8Mask = __ballot_sync(~0u, scan8);
+ if (scan8)
+ {
+ volatile U32* v = &s_scanTemp[0][thrInBlock + 16];
+ U32 sum = v[0];
+ #if (CR_BIN_SQR > 1 * 32)
+ sum += v[-1]; __syncwarp(scan8Mask); v[0] = sum; __syncwarp(scan8Mask);
+ #endif
+ #if (CR_BIN_SQR > 2 * 32)
+ sum += v[-2]; __syncwarp(scan8Mask); v[0] = sum; __syncwarp(scan8Mask);
+ #endif
+ #if (CR_BIN_SQR > 4 * 32)
+ sum += v[-4]; __syncwarp(scan8Mask); v[0] = sum; __syncwarp(scan8Mask);
+ #endif
+
+ if (thrInBlock == CR_BIN_SQR / 32 - 1)
+ s_firstActiveIdx = atomicAdd(&atomics.numActiveTiles, sum);
+ }
+
+ // Tile per thread: Output active tiles.
+
+ __syncthreads();
+
+ for (int tileInBin_base = 0; tileInBin_base < CR_BIN_SQR; tileInBin_base += CR_COARSE_WARPS * 32)
+ {
+ int tileInBin = tileInBin_base + thrInBlock;
+ bool act = (tileInBin < CR_BIN_SQR) && (s_tileStreamCurrOfs[tileInBin] >= 0);
+ U32 actMask = __ballot_sync(~0u, act);
+ if (act)
+ {
+ int activeIdx = s_firstActiveIdx;
+ activeIdx += s_scanTemp[0][(tileInBin >> 5) + 15];
+ activeIdx += __popc(actMask & getLaneMaskLt());
+ activeTiles[activeIdx] = binTileIdx + globalTileIdx(tileInBin, p.widthTiles);
+ }
+ }
+ }
+}
+
+//------------------------------------------------------------------------
diff --git a/LAM_gpro/external/nvdiffrast/nvdiffrast/common/cudaraster/impl/Constants.hpp b/LAM_gpro/external/nvdiffrast/nvdiffrast/common/cudaraster/impl/Constants.hpp
new file mode 100644
index 0000000..916315c
--- /dev/null
+++ b/LAM_gpro/external/nvdiffrast/nvdiffrast/common/cudaraster/impl/Constants.hpp
@@ -0,0 +1,73 @@
+// Copyright (c) 2009-2022, NVIDIA CORPORATION. All rights reserved.
+//
+// NVIDIA CORPORATION and its licensors retain all intellectual property
+// and proprietary rights in and to this software, related documentation
+// and any modifications thereto. Any use, reproduction, disclosure or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA CORPORATION is strictly prohibited.
+
+#pragma once
+
+//------------------------------------------------------------------------
+// Compile-time configuration of the cudaraster pipeline (all sizes in log2).
+#define CR_MAXVIEWPORT_LOG2 11 // ViewportSize / PixelSize.
+#define CR_SUBPIXEL_LOG2 4 // PixelSize / SubpixelSize.
+
+#define CR_MAXBINS_LOG2 4 // ViewportSize / BinSize.
+#define CR_BIN_LOG2 4 // BinSize / TileSize.
+#define CR_TILE_LOG2 3 // TileSize / PixelSize.
+
+#define CR_COVER8X8_LUT_SIZE 768 // 64-bit entries.
+#define CR_FLIPBIT_FLIP_Y 2 // Edge-function flip bits; packed 4 bits per edge into CRTriangleHeader.misc.
+#define CR_FLIPBIT_FLIP_X 3
+#define CR_FLIPBIT_SWAP_XY 4
+#define CR_FLIPBIT_COMPL 5
+
+#define CR_BIN_STREAMS_LOG2 4
+#define CR_BIN_SEG_LOG2 9 // 32-bit entries.
+#define CR_TILE_SEG_LOG2 5 // 32-bit entries.
+
+#define CR_MAXSUBTRIS_LOG2 24 // Triangle structs. Dictated by CoarseRaster.
+#define CR_COARSE_QUEUE_LOG2 10 // Triangles.
+
+#define CR_SETUP_WARPS 2 // Warps per block for each pipeline stage kernel — see kernel launch code.
+#define CR_SETUP_OPT_BLOCKS 8
+#define CR_BIN_WARPS 16
+#define CR_COARSE_WARPS 16 // Must be a power of two.
+#define CR_FINE_MAX_WARPS 20
+
+#define CR_EMBED_IMAGE_PARAMS 32 // Number of per-image parameter structs embedded in kernel launch parameter block.
+
+//------------------------------------------------------------------------
+// Derived sizes, expanded from the log2 configuration above.
+#define CR_MAXVIEWPORT_SIZE (1 << CR_MAXVIEWPORT_LOG2)
+#define CR_SUBPIXEL_SIZE (1 << CR_SUBPIXEL_LOG2)
+#define CR_SUBPIXEL_SQR (1 << (CR_SUBPIXEL_LOG2 * 2))
+
+#define CR_MAXBINS_SIZE (1 << CR_MAXBINS_LOG2)
+#define CR_MAXBINS_SQR (1 << (CR_MAXBINS_LOG2 * 2))
+#define CR_BIN_SIZE (1 << CR_BIN_LOG2)
+#define CR_BIN_SQR (1 << (CR_BIN_LOG2 * 2)) // Tiles per bin.
+
+#define CR_MAXTILES_LOG2 (CR_MAXBINS_LOG2 + CR_BIN_LOG2)
+#define CR_MAXTILES_SIZE (1 << CR_MAXTILES_LOG2)
+#define CR_MAXTILES_SQR (1 << (CR_MAXTILES_LOG2 * 2))
+#define CR_TILE_SIZE (1 << CR_TILE_LOG2)
+#define CR_TILE_SQR (1 << (CR_TILE_LOG2 * 2)) // Pixels per tile (8x8 = 64).
+
+#define CR_BIN_STREAMS_SIZE (1 << CR_BIN_STREAMS_LOG2)
+#define CR_BIN_SEG_SIZE (1 << CR_BIN_SEG_LOG2)
+#define CR_TILE_SEG_SIZE (1 << CR_TILE_SEG_LOG2)
+
+#define CR_MAXSUBTRIS_SIZE (1 << CR_MAXSUBTRIS_LOG2)
+#define CR_COARSE_QUEUE_SIZE (1 << CR_COARSE_QUEUE_LOG2)
+
+//------------------------------------------------------------------------
+// When evaluating interpolated Z pixel centers, we may introduce an error
+// of (+-CR_LERP_ERROR) ULPs.
+
+#define CR_LERP_ERROR(SAMPLES_LOG2) (2200u << (SAMPLES_LOG2))
+#define CR_DEPTH_MIN CR_LERP_ERROR(3)
+#define CR_DEPTH_MAX (CR_U32_MAX - CR_LERP_ERROR(3))
+
+//------------------------------------------------------------------------
diff --git a/LAM_gpro/external/nvdiffrast/nvdiffrast/common/cudaraster/impl/CudaRaster.cpp b/LAM_gpro/external/nvdiffrast/nvdiffrast/common/cudaraster/impl/CudaRaster.cpp
new file mode 100644
index 0000000..db8bf31
--- /dev/null
+++ b/LAM_gpro/external/nvdiffrast/nvdiffrast/common/cudaraster/impl/CudaRaster.cpp
@@ -0,0 +1,79 @@
+// Copyright (c) 2009-2022, NVIDIA CORPORATION. All rights reserved.
+//
+// NVIDIA CORPORATION and its licensors retain all intellectual property
+// and proprietary rights in and to this software, related documentation
+// and any modifications thereto. Any use, reproduction, disclosure or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA CORPORATION is strictly prohibited.
+
+#include "Defs.hpp"
+#include "../CudaRaster.hpp"
+#include "RasterImpl.hpp"
+
+using namespace CR;
+
+//------------------------------------------------------------------------
+// Stub interface implementation.
+//------------------------------------------------------------------------
+
+CudaRaster::CudaRaster() // Construct the public facade; all state lives in the private RasterImpl.
+{
+ m_impl = new RasterImpl(); // owned by this object; released in the destructor
+}
+
+CudaRaster::~CudaRaster() // Tear down the private implementation created in the constructor.
+{
+ delete m_impl;
+}
+
+void CudaRaster::setBufferSize(int width, int height, int numImages) // Forwarding stub: set framebuffer dimensions and image count.
+{
+ m_impl->setBufferSize(Vec3i(width, height, numImages)); // packed as (width, height, depth = numImages)
+}
+
+void CudaRaster::setViewport(int width, int height, int offsetX, int offsetY) // Forwarding stub: viewport size and offset within the buffer.
+{
+ m_impl->setViewport(Vec2i(width, height), Vec2i(offsetX, offsetY));
+}
+
+void CudaRaster::setRenderModeFlags(U32 flags) // Forwarding stub: RenderModeFlag_* bitmask (e.g. depth peeling).
+{
+ m_impl->setRenderModeFlags(flags);
+}
+
+void CudaRaster::deferredClear(U32 clearColor) // Forwarding stub: request a lazy clear performed during rasterization.
+{
+ m_impl->deferredClear(clearColor);
+}
+
+void CudaRaster::setVertexBuffer(void* vertices, int numVertices) // Forwarding stub: vertex data pointer (device memory, per RasterImpl contract — confirm).
+{
+ m_impl->setVertexBuffer(vertices, numVertices);
+}
+
+void CudaRaster::setIndexBuffer(void* indices, int numTriangles) // Forwarding stub: triangle index data.
+{
+ m_impl->setIndexBuffer(indices, numTriangles);
+}
+
+bool CudaRaster::drawTriangles(const int* ranges, bool peel, cudaStream_t stream) // Forwarding stub: run the raster pipeline on the given stream.
+{
+ return m_impl->drawTriangles((const Vec2i*)ranges, peel, stream); // 'ranges' reinterpreted as int pairs (Vec2i) — presumably (offset, count) per image; confirm in RasterImpl
+}
+
+void* CudaRaster::getColorBuffer(void) // Forwarding stub: expose the implementation's color buffer pointer.
+{
+ return m_impl->getColorBuffer();
+}
+
+void* CudaRaster::getDepthBuffer(void) // Forwarding stub: expose the implementation's depth buffer pointer.
+{
+ return m_impl->getDepthBuffer();
+}
+
+void CudaRaster::swapDepthAndPeel(void) // Forwarding stub: exchange depth and peel buffers (used by depth peeling passes).
+{
+ m_impl->swapDepthAndPeel();
+}
+
+//------------------------------------------------------------------------
diff --git a/LAM_gpro/external/nvdiffrast/nvdiffrast/common/cudaraster/impl/Defs.hpp b/LAM_gpro/external/nvdiffrast/nvdiffrast/common/cudaraster/impl/Defs.hpp
new file mode 100644
index 0000000..7aa7774
--- /dev/null
+++ b/LAM_gpro/external/nvdiffrast/nvdiffrast/common/cudaraster/impl/Defs.hpp
@@ -0,0 +1,90 @@
+// Copyright (c) 2009-2022, NVIDIA CORPORATION. All rights reserved.
+//
+// NVIDIA CORPORATION and its licensors retain all intellectual property
+// and proprietary rights in and to this software, related documentation
+// and any modifications thereto. Any use, reproduction, disclosure or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA CORPORATION is strictly prohibited.
+
+#pragma once
+#include <cuda_runtime.h>
+#include <cstdint>
+
+namespace CR
+{
+//------------------------------------------------------------------------
+// Basic definitions shared by host and device code.
+#ifndef NULL
+# define NULL 0
+#endif
+
+#ifdef __CUDACC__
+# define CR_CUDA 1
+#else
+# define CR_CUDA 0
+#endif
+
+#if CR_CUDA
+# define CR_CUDA_FUNC __device__ __inline__
+# define CR_CUDA_CONST __constant__
+#else
+# define CR_CUDA_FUNC inline
+# define CR_CUDA_CONST static const
+#endif
+
+#define CR_UNREF(X) ((void)(X)) // suppress unused-variable warnings
+#define CR_ARRAY_SIZE(X) ((int)(sizeof(X) / sizeof((X)[0]))) // element count of a C array
+
+//------------------------------------------------------------------------
+// Short fixed-width scalar aliases used throughout cudaraster.
+typedef uint8_t U8;
+typedef uint16_t U16;
+typedef uint32_t U32;
+typedef uint64_t U64;
+typedef int8_t S8;
+typedef int16_t S16;
+typedef int32_t S32;
+typedef int64_t S64;
+typedef float F32;
+typedef double F64;
+typedef void (*FuncPtr)(void);
+
+//------------------------------------------------------------------------
+// Numeric limits for the aliases above.
+#define CR_U32_MAX (0xFFFFFFFFu)
+#define CR_S32_MIN (~0x7FFFFFFF)
+#define CR_S32_MAX (0x7FFFFFFF)
+#define CR_U64_MAX ((U64)(S64)-1)
+#define CR_S64_MIN ((S64)-1 << 63)
+#define CR_S64_MAX (~((S64)-1 << 63))
+#define CR_F32_MIN (1.175494351e-38f)
+#define CR_F32_MAX (3.402823466e+38f)
+#define CR_F64_MIN (2.2250738585072014e-308)
+#define CR_F64_MAX (1.7976931348623158e+308)
+
+//------------------------------------------------------------------------
+// Misc types.
+
+class Vec2i // Minimal 2-int vector used for sizes/offsets in the public API.
+{
+public:
+ Vec2i(int x_, int y_) : x(x_), y(y_) {}
+ int x, y;
+};
+
+class Vec3i // Minimal 3-int vector (e.g. width, height, numImages).
+{
+public:
+ Vec3i(int x_, int y_, int z_) : x(x_), y(y_), z(z_) {}
+ int x, y, z;
+};
+
+//------------------------------------------------------------------------
+// CUDA utilities.
+
+#if CR_CUDA
+# define globalThreadIdx (threadIdx.x + blockDim.x * (threadIdx.y + blockDim.y * (blockIdx.x + gridDim.x * blockIdx.y))) // flat global thread index over a 2D grid of 2D blocks
+#endif
+
+//------------------------------------------------------------------------
+} // namespace CR
diff --git a/LAM_gpro/external/nvdiffrast/nvdiffrast/common/cudaraster/impl/FineRaster.inl b/LAM_gpro/external/nvdiffrast/nvdiffrast/common/cudaraster/impl/FineRaster.inl
new file mode 100644
index 0000000..720e999
--- /dev/null
+++ b/LAM_gpro/external/nvdiffrast/nvdiffrast/common/cudaraster/impl/FineRaster.inl
@@ -0,0 +1,385 @@
+// Copyright (c) 2009-2022, NVIDIA CORPORATION. All rights reserved.
+//
+// NVIDIA CORPORATION and its licensors retain all intellectual property
+// and proprietary rights in and to this software, related documentation
+// and any modifications thereto. Any use, reproduction, disclosure or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA CORPORATION is strictly prohibited.
+
+//------------------------------------------------------------------------
+// Utility funcs.
+//------------------------------------------------------------------------
+
+__device__ __inline__ void initTileZMax(U32& tileZMax, bool& tileZUpd, volatile U32* tileDepth) // Reset tile zmax; flag whether a real max-reduction is needed later.
+{
+ tileZMax = CR_DEPTH_MAX; // start from the farthest representable depth
+ tileZUpd = (::min(tileDepth[threadIdx.x], tileDepth[threadIdx.x + 32]) < tileZMax); // each lane owns 2 of the 64 tile pixels; any closer depth => zmax stale
+}
+
+__device__ __inline__ void updateTileZMax(U32& tileZMax, bool& tileZUpd, volatile U32* tileDepth, volatile U32* temp) // Recompute tile-wide max depth via a warp max-scan; no-op if no lane flagged an update.
+{
+ // Entry is warp-coherent: all 32 lanes take the same branch.
+ if (__any_sync(~0u, tileZUpd))
+ {
+ U32 z = ::max(tileDepth[threadIdx.x], tileDepth[threadIdx.x + 32]); __syncwarp(); // per-lane max of its two pixels
+ temp[threadIdx.x + 16] = z; __syncwarp(); // +16 guard band: temp[0..15] are zero, so out-of-range reads below are safe
+ z = ::max(z, temp[threadIdx.x + 16 - 1]); __syncwarp(); temp[threadIdx.x + 16] = z; __syncwarp();
+ z = ::max(z, temp[threadIdx.x + 16 - 2]); __syncwarp(); temp[threadIdx.x + 16] = z; __syncwarp();
+ z = ::max(z, temp[threadIdx.x + 16 - 4]); __syncwarp(); temp[threadIdx.x + 16] = z; __syncwarp();
+ z = ::max(z, temp[threadIdx.x + 16 - 8]); __syncwarp(); temp[threadIdx.x + 16] = z; __syncwarp();
+ z = ::max(z, temp[threadIdx.x + 16 - 16]); __syncwarp(); temp[threadIdx.x + 16] = z; __syncwarp();
+ tileZMax = temp[47]; // last scan slot = max over all 64 pixels
+ tileZUpd = false;
+ }
+}
+
+//------------------------------------------------------------------------
+
+__device__ __inline__ void getTriangle(const CRParams& p, S32& triIdx, S32& dataIdx, uint4& triHeader, S32& segment) // Fetch one queued triangle per lane from the current tile segment; advances 'segment'.
+{
+ const CRTriangleHeader* triHeaderPtr = (const CRTriangleHeader*)p.triHeader + blockIdx.z * p.maxSubtris; // fixed: stray ';;' removed
+ const S32* tileSegData = (const S32*)p.tileSegData + p.maxTileSegs * CR_TILE_SEG_SIZE * blockIdx.z;
+ const S32* tileSegNext = (const S32*)p.tileSegNext + p.maxTileSegs * blockIdx.z;
+ const S32* tileSegCount = (const S32*)p.tileSegCount + p.maxTileSegs * blockIdx.z;
+
+ if (threadIdx.x >= tileSegCount[segment])
+ {
+ triIdx = -1; // lane has no entry in this segment; NOTE(review): triHeader is left unset here — callers must gate on triIdx >= 0
+ dataIdx = -1;
+ }
+ else
+ {
+ int subtriIdx = tileSegData[segment * CR_TILE_SEG_SIZE + threadIdx.x]; // packed entry: (triIdx << 3) | subtriIdx
+ triIdx = subtriIdx >> 3;
+ dataIdx = triIdx;
+ subtriIdx &= 7;
+ if (subtriIdx != 7)
+ dataIdx = triHeaderPtr[triIdx].misc + subtriIdx; // split triangle: misc holds the base index of its subtriangle records
+ triHeader = *((uint4*)triHeaderPtr + dataIdx); // CRTriangleHeader is 16 bytes => single uint4 load
+ }
+
+ // advance to next segment
+ segment = tileSegNext[segment]; // -1 terminates the per-tile segment list (checked by the caller)
+}
+
+//------------------------------------------------------------------------
+
+__device__ __inline__ bool earlyZCull(uint4 triHeader, U32 tileZMax)
+{
+ U32 triZMin = triHeader.w & 0xFFFFF000; // upper 20 bits of .w carry the triangle's zmin
+ return triZMin > tileZMax; // strictly behind the tile's current max depth => cull before rasterizing
+}
+
+//------------------------------------------------------------------------
+
+__device__ __inline__ U64 trianglePixelCoverage(const CRParams& p, const uint4& triHeader, int tileX, int tileY, volatile U64* s_cover8x8_lut) // Returns a 64-bit mask, one bit per pixel of the 8x8 tile covered by the triangle.
+{
+ int baseX = (tileX << (CR_TILE_LOG2 + CR_SUBPIXEL_LOG2)) - ((p.widthPixelsVp - 1) << (CR_SUBPIXEL_LOG2 - 1)); // tile origin in subpixels, relative to viewport center (vertex coords use that origin)
+ int baseY = (tileY << (CR_TILE_LOG2 + CR_SUBPIXEL_LOG2)) - ((p.heightPixelsVp - 1) << (CR_SUBPIXEL_LOG2 - 1));
+
+ // extract S16 vertex positions while subtracting tile coordinates
+ S32 v0x = sub_s16lo_s16lo(triHeader.x, baseX);
+ S32 v0y = sub_s16hi_s16lo(triHeader.x, baseY);
+ S32 v01x = sub_s16lo_s16lo(triHeader.y, triHeader.x);
+ S32 v01y = sub_s16hi_s16hi(triHeader.y, triHeader.x);
+ S32 v20x = sub_s16lo_s16lo(triHeader.x, triHeader.z);
+ S32 v20y = sub_s16hi_s16hi(triHeader.x, triHeader.z);
+
+ // extract flipbits (4 bits per edge, packed into triHeader.w)
+ U32 f01 = (triHeader.w >> 6) & 0x3C;
+ U32 f12 = (triHeader.w >> 2) & 0x3C;
+ U32 f20 = (triHeader.w << 2) & 0x3C;
+
+ // compute per-edge coverage masks
+ U64 c01, c12, c20;
+ c01 = cover8x8_exact_fast(v0x, v0y, v01x, v01y, f01, s_cover8x8_lut);
+ c12 = cover8x8_exact_fast(v0x + v01x, v0y + v01y, -v01x - v20x, -v01y - v20y, f12, s_cover8x8_lut); // edge v1->v2: direction = -(v01 + v20)
+ c20 = cover8x8_exact_fast(v0x, v0y, v20x, v20y, f20, s_cover8x8_lut);
+
+ // combine masks: a pixel is covered only if inside all three edges
+ return c01 & c12 & c20;
+}
+
+//------------------------------------------------------------------------
+
+__device__ __inline__ U32 scan32_value(U32 value, volatile U32* temp) // Warp-wide inclusive prefix sum; requires temp[0..15] to be zero (guard band).
+{
+ __syncwarp();
+ temp[threadIdx.x + 16] = value; __syncwarp(); // +16 offset lets lane 0 read zeros instead of out-of-range slots
+ value += temp[threadIdx.x + 16 - 1]; __syncwarp(); temp[threadIdx.x + 16] = value; __syncwarp();
+ value += temp[threadIdx.x + 16 - 2]; __syncwarp(); temp[threadIdx.x + 16] = value; __syncwarp();
+ value += temp[threadIdx.x + 16 - 4]; __syncwarp(); temp[threadIdx.x + 16] = value; __syncwarp();
+ value += temp[threadIdx.x + 16 - 8]; __syncwarp(); temp[threadIdx.x + 16] = value; __syncwarp();
+ value += temp[threadIdx.x + 16 - 16]; __syncwarp(); temp[threadIdx.x + 16] = value; __syncwarp();
+ return value; // inclusive sum for this lane
+}
+
+__device__ __inline__ volatile const U32& scan32_total(volatile U32* temp) // Total of the most recent scan32_value() over the same temp buffer.
+{
+ return temp[47]; // lane 31's slot (31 + 16) holds the warp-wide sum
+}
+
+//------------------------------------------------------------------------
+
+__device__ __inline__ S32 findBit(U64 mask, int idx) // Locate the idx'th set bit of a 64-bit coverage mask (branchless popcount search); maps a fragment slot to its pixel.
+{
+ U32 x = getLo(mask);
+ int pop = __popc(x);
+ bool p = (pop <= idx); // target bit lives in the high word?
+ if (p) x = getHi(mask);
+ if (p) idx -= pop;
+ int bit = p ? 32 : 0;
+
+ pop = __popc(x & 0x0000ffffu); // narrow to 16-bit half
+ p = (pop <= idx);
+ if (p) x >>= 16;
+ if (p) bit += 16;
+ if (p) idx -= pop;
+
+ U32 tmp = x & 0x000000ffu; // narrow to 8-bit half (kept unshifted)
+ pop = __popc(tmp);
+ p = (pop <= idx);
+ if (p) tmp = x & 0x0000ff00u;
+ if (p) idx -= pop;
+
+ return findLeadingOne(tmp) + bit - idx; // NOTE(review): final step resolves within the byte via leading-one position minus remaining count — verified only against usage in fineRasterImpl
+}
+
+//------------------------------------------------------------------------
+// Single-sample implementation.
+//------------------------------------------------------------------------
+
+__device__ __inline__ void executeROP(U32 color, U32 depth, volatile U32* pColor, volatile U32* pDepth, U32 ropMask) // Depth-tested write with deterministic tie-break; ropMask = participating lanes.
+{
+ atomicMin((U32*)pDepth, depth); // z-test: keep the nearest depth
+ __syncwarp(ropMask);
+ bool act = (depth == *pDepth); // this lane's fragment survived (possibly tied with another lane)
+ __syncwarp(ropMask);
+ U32 actMask = __ballot_sync(ropMask, act);
+ if (act)
+ {
+ *pDepth = 0; // tie-break among lanes targeting the same pixel:
+ __syncwarp(actMask);
+ atomicMax((U32*)pDepth, threadIdx.x); // highest lane index wins the pixel...
+ __syncwarp(actMask);
+ if (*pDepth == threadIdx.x)
+ {
+ *pDepth = depth; // ...and restores the depth and writes its color
+ *pColor = color;
+ }
+ __syncwarp(actMask);
+ }
+}
+
+//------------------------------------------------------------------------
+
+__device__ __inline__ void fineRasterImpl(const CRParams p) // Fine raster stage: one warp per 8x8 tile; consumes tile segments, depth-tests and writes pixels.
+{
+ // for 20 warps:
+ __shared__ volatile U64 s_cover8x8_lut[CR_COVER8X8_LUT_SIZE]; // 6KB
+ __shared__ volatile U32 s_tileColor [CR_FINE_MAX_WARPS][CR_TILE_SQR]; // 5KB
+ __shared__ volatile U32 s_tileDepth [CR_FINE_MAX_WARPS][CR_TILE_SQR]; // 5KB
+ __shared__ volatile U32 s_tilePeel [CR_FINE_MAX_WARPS][CR_TILE_SQR]; // 5KB
+ __shared__ volatile U32 s_triDataIdx [CR_FINE_MAX_WARPS][64]; // 5KB CRTriangleData index
+ __shared__ volatile U64 s_triangleCov [CR_FINE_MAX_WARPS][64]; // 10KB coverage mask
+ __shared__ volatile U32 s_triangleFrag[CR_FINE_MAX_WARPS][64]; // 5KB fragment index
+ __shared__ volatile U32 s_temp [CR_FINE_MAX_WARPS][80]; // 6.25KB
+ // = 47.25KB total
+
+ CRAtomics& atomics = p.atomics[blockIdx.z];
+ const CRTriangleData* triData = (const CRTriangleData*)p.triData + blockIdx.z * p.maxSubtris;
+
+ const S32* activeTiles = (const S32*)p.activeTiles + CR_MAXTILES_SQR * blockIdx.z;
+ const S32* tileFirstSeg = (const S32*)p.tileFirstSeg + CR_MAXTILES_SQR * blockIdx.z;
+
+ volatile U32* tileColor = s_tileColor[threadIdx.y]; // per-warp views of the shared arrays
+ volatile U32* tileDepth = s_tileDepth[threadIdx.y];
+ volatile U32* tilePeel = s_tilePeel[threadIdx.y];
+ volatile U32* triDataIdx = s_triDataIdx[threadIdx.y];
+ volatile U64* triangleCov = s_triangleCov[threadIdx.y];
+ volatile U32* triangleFrag = s_triangleFrag[threadIdx.y];
+ volatile U32* temp = s_temp[threadIdx.y];
+
+ if (atomics.numSubtris > p.maxSubtris || atomics.numBinSegs > p.maxBinSegs || atomics.numTileSegs > p.maxTileSegs)
+ return; // an earlier stage overflowed its buffer => output is invalid, bail out
+
+ temp[threadIdx.x] = 0; // first 16 elements of temp are always zero (scan guard band)
+ cover8x8_setupLUT(s_cover8x8_lut);
+ __syncthreads();
+
+ // loop over tiles
+ for (;;)
+ {
+ // pick a tile
+ if (threadIdx.x == 0)
+ temp[16] = atomicAdd(&atomics.fineCounter, 1); // work-stealing ticket
+ __syncwarp();
+ int activeIdx = temp[16]; // broadcast lane 0's ticket to the whole warp
+ if (activeIdx >= atomics.numActiveTiles)
+ break;
+
+ int tileIdx = activeTiles[activeIdx];
+ S32 segment = tileFirstSeg[tileIdx];
+ int tileY = tileIdx / p.widthTiles;
+ int tileX = tileIdx - tileY * p.widthTiles;
+ int px = (tileX << CR_TILE_LOG2) + (threadIdx.x & (CR_TILE_SIZE - 1)); // each lane owns pixels (px,py) and (px,py+4)
+ int py = (tileY << CR_TILE_LOG2) + (threadIdx.x >> CR_TILE_LOG2);
+
+ // initialize per-tile state
+ int triRead = 0, triWrite = 0;
+ int fragRead = 0, fragWrite = 0;
+ if (threadIdx.x == 0)
+ triangleFrag[63] = 0; // "previous triangle"
+
+ // deferred clear => clear tile
+ if (p.deferredClear)
+ {
+ tileColor[threadIdx.x] = p.clearColor;
+ tileDepth[threadIdx.x] = p.clearDepth;
+ tileColor[threadIdx.x + 32] = p.clearColor;
+ tileDepth[threadIdx.x + 32] = p.clearDepth;
+ }
+ else // otherwise => read tile from framebuffer
+ {
+ U32* pColor = (U32*)p.colorBuffer + p.strideX * p.strideY * blockIdx.z;
+ U32* pDepth = (U32*)p.depthBuffer + p.strideX * p.strideY * blockIdx.z;
+ tileColor[threadIdx.x] = pColor[px + p.strideX * py];
+ tileDepth[threadIdx.x] = pDepth[px + p.strideX * py];
+ tileColor[threadIdx.x + 32] = pColor[px + p.strideX * (py + 4)];
+ tileDepth[threadIdx.x + 32] = pDepth[px + p.strideX * (py + 4)];
+ }
+
+ // read peeling inputs if enabled
+ if (p.renderModeFlags & CudaRaster::RenderModeFlag_EnableDepthPeeling)
+ {
+ U32* pPeel = (U32*)p.peelBuffer + p.strideX * p.strideY * blockIdx.z;
+ tilePeel[threadIdx.x] = pPeel[px + p.strideX * py];
+ tilePeel[threadIdx.x + 32] = pPeel[px + p.strideX * (py + 4)];
+ }
+
+ U32 tileZMax;
+ bool tileZUpd;
+ initTileZMax(tileZMax, tileZUpd, tileDepth);
+
+ // process fragments
+ for(;;)
+ {
+ // need to queue more fragments?
+ if (fragWrite - fragRead < 32 && segment >= 0)
+ {
+ // update tile z - coherent over warp
+ updateTileZMax(tileZMax, tileZUpd, tileDepth, temp);
+
+ // read triangles
+ do
+ {
+ // read triangle index and data, advance to next segment
+ S32 triIdx, dataIdx;
+ uint4 triHeader;
+ getTriangle(p, triIdx, dataIdx, triHeader, segment);
+
+ // early z cull
+ if (triIdx >= 0 && earlyZCull(triHeader, tileZMax))
+ triIdx = -1;
+
+ // determine coverage
+ U64 coverage = trianglePixelCoverage(p, triHeader, tileX, tileY, s_cover8x8_lut);
+ S32 pop = (triIdx == -1) ? 0 : __popcll(coverage); // culled/empty lanes contribute zero fragments
+
+ // fragment count scan
+ U32 frag = scan32_value(pop, temp);
+ frag += fragWrite; // frag now holds cumulative fragment count
+ fragWrite += scan32_total(temp);
+
+ // queue non-empty triangles
+ U32 goodMask = __ballot_sync(~0u, pop != 0);
+ if (pop != 0)
+ {
+ int idx = (triWrite + __popc(goodMask & getLaneMaskLt())) & 63; // 64-entry ring buffer
+ triDataIdx [idx] = dataIdx;
+ triangleFrag[idx] = frag;
+ triangleCov [idx] = coverage;
+ }
+ triWrite += __popc(goodMask);
+ }
+ while (fragWrite - fragRead < 32 && segment >= 0);
+ }
+ __syncwarp();
+
+ // end of segment?
+ if (fragRead == fragWrite)
+ break;
+
+ // clear triangle boundaries
+ temp[threadIdx.x + 16] = 0;
+ __syncwarp();
+
+ // tag triangle boundaries
+ if (triRead + threadIdx.x < triWrite)
+ {
+ int idx = triangleFrag[(triRead + threadIdx.x) & 63] - fragRead;
+ if (idx <= 32)
+ temp[idx + 16 - 1] = 1; // mark the slot where this triangle's fragments end
+ }
+ __syncwarp();
+
+ int ropLaneIdx = threadIdx.x;
+ U32 boundaryMask = __ballot_sync(~0u, temp[ropLaneIdx + 16]); // lanes starting a new triangle's fragment run
+
+ // distribute fragments
+ bool hasFragment = (ropLaneIdx < fragWrite - fragRead);
+ U32 fragmentMask = __ballot_sync(~0u, hasFragment);
+ if (hasFragment)
+ {
+ int triBufIdx = (triRead + __popc(boundaryMask & getLaneMaskLt())) & 63;
+ int fragIdx = add_sub(fragRead, ropLaneIdx, triangleFrag[(triBufIdx - 1) & 63]); // fragment index within this triangle's coverage (presumably fragRead + lane - prevTriangleEnd)
+ U64 coverage = triangleCov[triBufIdx];
+ int pixelInTile = findBit(coverage, fragIdx); // map fragment slot to its pixel within the 8x8 tile
+ int dataIdx = triDataIdx[triBufIdx];
+
+ // determine pixel position
+ U32 pixelX = (tileX << CR_TILE_LOG2) + (pixelInTile & 7);
+ U32 pixelY = (tileY << CR_TILE_LOG2) + (pixelInTile >> 3);
+
+ // depth test
+ U32 depth = 0;
+ uint4 td = *((uint4*)triData + dataIdx * (sizeof(CRTriangleData) >> 4)); // CRTriangleData is 16 bytes => one uint4 load (zx, zy, zb, id)
+
+ depth = td.x * pixelX + td.y * pixelY + td.z; // fixed-point depth plane evaluated at the pixel
+ bool zkill = (p.renderModeFlags & CudaRaster::RenderModeFlag_EnableDepthPeeling) && (depth <= tilePeel[pixelInTile]); // peeling: reject anything at or in front of the previous layer
+ if (!zkill)
+ {
+ U32 oldDepth = tileDepth[pixelInTile];
+ if (depth > oldDepth)
+ zkill = true;
+ else if (oldDepth == tileZMax)
+ tileZUpd = true; // we are replacing previous zmax => need to update
+ }
+
+ U32 ropMask = __ballot_sync(fragmentMask, !zkill);
+ if (!zkill)
+ executeROP(td.w, depth, &tileColor[pixelInTile], &tileDepth[pixelInTile], ropMask);
+ }
+ // no need to sync, as next up is updateTileZMax that does internal warp sync
+
+ // update counters
+ fragRead = ::min(fragRead + 32, fragWrite); // consume at most one warp's worth of fragments per round
+ triRead += __popc(boundaryMask);
+ }
+
+ // Write tile back to the framebuffer.
+ if (true)
+ {
+ int px = (tileX << CR_TILE_LOG2) + (threadIdx.x & (CR_TILE_SIZE - 1));
+ int py = (tileY << CR_TILE_LOG2) + (threadIdx.x >> CR_TILE_LOG2);
+ U32* pColor = (U32*)p.colorBuffer + p.strideX * p.strideY * blockIdx.z;
+ U32* pDepth = (U32*)p.depthBuffer + p.strideX * p.strideY * blockIdx.z;
+ pColor[px + p.strideX * py] = tileColor[threadIdx.x];
+ pDepth[px + p.strideX * py] = tileDepth[threadIdx.x];
+ pColor[px + p.strideX * (py + 4)] = tileColor[threadIdx.x + 32];
+ pDepth[px + p.strideX * (py + 4)] = tileDepth[threadIdx.x + 32];
+ }
+ }
+}
+
+//------------------------------------------------------------------------
diff --git a/LAM_gpro/external/nvdiffrast/nvdiffrast/common/cudaraster/impl/PrivateDefs.hpp b/LAM_gpro/external/nvdiffrast/nvdiffrast/common/cudaraster/impl/PrivateDefs.hpp
new file mode 100644
index 0000000..26133c9
--- /dev/null
+++ b/LAM_gpro/external/nvdiffrast/nvdiffrast/common/cudaraster/impl/PrivateDefs.hpp
@@ -0,0 +1,153 @@
+// Copyright (c) 2009-2022, NVIDIA CORPORATION. All rights reserved.
+//
+// NVIDIA CORPORATION and its licensors retain all intellectual property
+// and proprietary rights in and to this software, related documentation
+// and any modifications thereto. Any use, reproduction, disclosure or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA CORPORATION is strictly prohibited.
+
+#pragma once
+#include "Defs.hpp"
+#include "Constants.hpp"
+
+namespace CR
+{
+//------------------------------------------------------------------------
+// Projected triangle.
+//------------------------------------------------------------------------
+
+struct CRTriangleHeader
+{
+ S16 v0x; // Subpixels relative to viewport center. Valid if triSubtris = 1.
+ S16 v0y;
+ S16 v1x;
+ S16 v1y;
+ S16 v2x;
+ S16 v2y;
+
+ U32 misc; // triSubtris=1: (zmin:20, f01:4, f12:4, f20:4), triSubtris>=2: (subtriBase)
+};
+
+//------------------------------------------------------------------------
+
+struct CRTriangleData
+{
+ U32 zx; // zx * sampleX + zy * sampleY + zb = lerp(CR_DEPTH_MIN, CR_DEPTH_MAX, (clipZ / clipW + 1) / 2)
+ U32 zy;
+ U32 zb;
+ U32 id; // Triangle id.
+};
+
+//------------------------------------------------------------------------
+// Device-side structures.
+//------------------------------------------------------------------------
+
+struct CRAtomics
+{
+ // Setup.
+ S32 numSubtris; // = numTris
+
+ // Bin.
+ S32 binCounter; // = 0
+ S32 numBinSegs; // = 0
+
+ // Coarse.
+ S32 coarseCounter; // = 0
+ S32 numTileSegs; // = 0
+ S32 numActiveTiles; // = 0
+
+ // Fine.
+ S32 fineCounter; // = 0
+};
+
+//------------------------------------------------------------------------
+
+struct CRImageParams
+{
+ S32 triOffset; // First triangle index to draw.
+ S32 triCount; // Number of triangles to draw.
+ S32 binBatchSize; // Number of triangles per batch.
+};
+
+//------------------------------------------------------------------------
+
+struct CRParams
+{
+ // Common.
+
+ CRAtomics* atomics; // Work counters. Per-image.
+ S32 numImages; // Batch size.
+ S32 totalCount; // In range mode, total number of triangles to render.
+ S32 instanceMode; // 0 = range mode, 1 = instance mode.
+
+ S32 numVertices; // Number of vertices in input buffer, not counting multiples in instance mode.
+ S32 numTriangles; // Number of triangles in input buffer.
+ void* vertexBuffer; // numVertices * float4(x, y, z, w)
+ void* indexBuffer; // numTriangles * int3(vi0, vi1, vi2)
+
+ S32 widthPixels; // Render buffer size in pixels. Must be multiple of tile size (8x8).
+ S32 heightPixels;
+ S32 widthPixelsVp; // Viewport size in pixels.
+ S32 heightPixelsVp;
+ S32 widthBins; // widthPixels / CR_BIN_SIZE
+ S32 heightBins; // heightPixels / CR_BIN_SIZE
+ S32 numBins; // widthBins * heightBins
+
+ F32 xs; // Vertex position adjustments for tiled rendering.
+ F32 ys;
+ F32 xo;
+ F32 yo;
+
+ S32 widthTiles; // widthPixels / CR_TILE_SIZE
+ S32 heightTiles; // heightPixels / CR_TILE_SIZE
+ S32 numTiles; // widthTiles * heightTiles
+
+ U32 renderModeFlags;
+ S32 deferredClear; // 1 = Clear framebuffer before rendering triangles.
+ U32 clearColor;
+ U32 clearDepth;
+
+ // These are uniform across batch.
+
+ S32 maxSubtris;
+ S32 maxBinSegs;
+ S32 maxTileSegs;
+
+ // Setup output / bin input.
+
+ void* triSubtris; // maxSubtris * U8
+ void* triHeader; // maxSubtris * CRTriangleHeader
+ void* triData; // maxSubtris * CRTriangleData
+
+ // Bin output / coarse input.
+
+ void* binSegData; // maxBinSegs * CR_BIN_SEG_SIZE * S32
+ void* binSegNext; // maxBinSegs * S32
+ void* binSegCount; // maxBinSegs * S32
+ void* binFirstSeg; // CR_MAXBINS_SQR * CR_BIN_STREAMS_SIZE * (S32 segIdx), -1 = none
+ void* binTotal; // CR_MAXBINS_SQR * CR_BIN_STREAMS_SIZE * (S32 numTris)
+
+ // Coarse output / fine input.
+
+ void* tileSegData; // maxTileSegs * CR_TILE_SEG_SIZE * S32
+ void* tileSegNext; // maxTileSegs * S32
+ void* tileSegCount; // maxTileSegs * S32
+ void* activeTiles; // CR_MAXTILES_SQR * (S32 tileIdx)
+ void* tileFirstSeg; // CR_MAXTILES_SQR * (S32 segIdx), -1 = none
+
+ // Surface buffers. Outer tile offset is baked into pointers.
+
+ void* colorBuffer; // sizePixels.x * sizePixels.y * numImages * U32
+ void* depthBuffer; // sizePixels.x * sizePixels.y * numImages * U32
+ void* peelBuffer; // sizePixels.x * sizePixels.y * numImages * U32, only if peeling enabled.
+ S32 strideX; // horizontal size in pixels
+ S32 strideY; // vertical stride in pixels
+
+ // Per-image parameters for first images are embedded here to avoid extra memcpy for small batches.
+
+ CRImageParams imageParamsFirst[CR_EMBED_IMAGE_PARAMS];
+ const CRImageParams* imageParamsExtra; // After CR_EMBED_IMAGE_PARAMS.
+};
+
+//------------------------------------------------------------------------
+}
diff --git a/LAM_gpro/external/nvdiffrast/nvdiffrast/common/cudaraster/impl/RasterImpl.cpp b/LAM_gpro/external/nvdiffrast/nvdiffrast/common/cudaraster/impl/RasterImpl.cpp
new file mode 100644
index 0000000..f7f05d5
--- /dev/null
+++ b/LAM_gpro/external/nvdiffrast/nvdiffrast/common/cudaraster/impl/RasterImpl.cpp
@@ -0,0 +1,370 @@
+// Copyright (c) 2009-2022, NVIDIA CORPORATION. All rights reserved.
+//
+// NVIDIA CORPORATION and its licensors retain all intellectual property
+// and proprietary rights in and to this software, related documentation
+// and any modifications thereto. Any use, reproduction, disclosure or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA CORPORATION is strictly prohibited.
+
+#include "../../framework.h"
+#include "PrivateDefs.hpp"
+#include "Constants.hpp"
+#include "RasterImpl.hpp"
+#include <algorithm>
+
+using namespace CR;
+using std::min;
+using std::max;
+
+//------------------------------------------------------------------------
+// Kernel prototypes and variables.
+
+void triangleSetupKernel (const CRParams p);
+void binRasterKernel (const CRParams p);
+void coarseRasterKernel (const CRParams p);
+void fineRasterKernel (const CRParams p);
+
+//------------------------------------------------------------------------
+
+RasterImpl::RasterImpl(void)
+: m_renderModeFlags (0),
+ m_deferredClear (false),
+ m_clearColor (0),
+ m_vertexPtr (NULL),
+ m_indexPtr (NULL),
+ m_numVertices (0),
+ m_numTriangles (0),
+ m_bufferSizesReported (0),
+
+ m_numImages (0),
+ m_bufferSizePixels (0, 0),
+ m_bufferSizeVp (0, 0),
+ m_sizePixels (0, 0),
+ m_sizeVp (0, 0),
+ m_offsetPixels (0, 0),
+ m_sizeBins (0, 0),
+ m_numBins (0),
+ m_sizeTiles (0, 0),
+ m_numTiles (0),
+
+ m_numSMs (1),
+ m_numCoarseBlocksPerSM (1),
+ m_numFineBlocksPerSM (1),
+ m_numFineWarpsPerBlock (1),
+
+ m_maxSubtris (1),
+ m_maxBinSegs (1),
+ m_maxTileSegs (1)
+{
+ // Query relevant device attributes.
+
+ int currentDevice = 0;
+ NVDR_CHECK_CUDA_ERROR(cudaGetDevice(&currentDevice));
+ NVDR_CHECK_CUDA_ERROR(cudaDeviceGetAttribute(&m_numSMs, cudaDevAttrMultiProcessorCount, currentDevice));
+ cudaFuncAttributes attr;
+ NVDR_CHECK_CUDA_ERROR(cudaFuncGetAttributes(&attr, (void*)fineRasterKernel));
+ m_numFineWarpsPerBlock = min(attr.maxThreadsPerBlock / 32, CR_FINE_MAX_WARPS);
+ NVDR_CHECK_CUDA_ERROR(cudaOccupancyMaxActiveBlocksPerMultiprocessor(&m_numCoarseBlocksPerSM, (void*)coarseRasterKernel, 32 * CR_COARSE_WARPS, 0));
+ NVDR_CHECK_CUDA_ERROR(cudaOccupancyMaxActiveBlocksPerMultiprocessor(&m_numFineBlocksPerSM, (void*)fineRasterKernel, 32 * m_numFineWarpsPerBlock, 0));
+
+ // Setup functions.
+
+ NVDR_CHECK_CUDA_ERROR(cudaFuncSetCacheConfig((void*)triangleSetupKernel, cudaFuncCachePreferShared));
+ NVDR_CHECK_CUDA_ERROR(cudaFuncSetCacheConfig((void*)binRasterKernel, cudaFuncCachePreferShared));
+ NVDR_CHECK_CUDA_ERROR(cudaFuncSetCacheConfig((void*)coarseRasterKernel, cudaFuncCachePreferShared));
+ NVDR_CHECK_CUDA_ERROR(cudaFuncSetCacheConfig((void*)fineRasterKernel, cudaFuncCachePreferShared));
+}
+
+//------------------------------------------------------------------------
+
+RasterImpl::~RasterImpl(void)
+{
+ // Empty.
+}
+
+//------------------------------------------------------------------------
+
+void RasterImpl::setBufferSize(Vec3i size)
+{
+ // Internal buffer width and height must be divisible by tile size.
+ int w = (size.x + CR_TILE_SIZE - 1) & (-CR_TILE_SIZE);
+ int h = (size.y + CR_TILE_SIZE - 1) & (-CR_TILE_SIZE);
+
+ m_bufferSizePixels = Vec2i(w, h);
+ m_bufferSizeVp = Vec2i(size.x, size.y);
+ m_numImages = size.z;
+
+ m_colorBuffer.reset(w * h * size.z * sizeof(U32));
+ m_depthBuffer.reset(w * h * size.z * sizeof(U32));
+}
+
+//------------------------------------------------------------------------
+
+void RasterImpl::setViewport(Vec2i size, Vec2i offset)
+{
+ // Offset must be divisible by tile size.
+ NVDR_CHECK((offset.x & (CR_TILE_SIZE - 1)) == 0 && (offset.y & (CR_TILE_SIZE - 1)) == 0, "invalid viewport offset");
+
+ // Round internal viewport size to multiples of tile size.
+ int w = (size.x + CR_TILE_SIZE - 1) & (-CR_TILE_SIZE);
+ int h = (size.y + CR_TILE_SIZE - 1) & (-CR_TILE_SIZE);
+
+ m_sizePixels = Vec2i(w, h);
+ m_offsetPixels = offset;
+ m_sizeVp = Vec2i(size.x, size.y);
+ m_sizeTiles.x = m_sizePixels.x >> CR_TILE_LOG2;
+ m_sizeTiles.y = m_sizePixels.y >> CR_TILE_LOG2;
+ m_numTiles = m_sizeTiles.x * m_sizeTiles.y;
+ m_sizeBins.x = (m_sizeTiles.x + CR_BIN_SIZE - 1) >> CR_BIN_LOG2;
+ m_sizeBins.y = (m_sizeTiles.y + CR_BIN_SIZE - 1) >> CR_BIN_LOG2;
+ m_numBins = m_sizeBins.x * m_sizeBins.y;
+}
+
+void RasterImpl::swapDepthAndPeel(void)
+{
+ m_peelBuffer.reset(m_depthBuffer.getSize()); // Ensure equal size and valid pointer.
+
+ void* tmp = m_depthBuffer.getPtr();
+ m_depthBuffer.setPtr(m_peelBuffer.getPtr());
+ m_peelBuffer.setPtr(tmp);
+}
+
+//------------------------------------------------------------------------
+
+bool RasterImpl::drawTriangles(const Vec2i* ranges, bool peel, cudaStream_t stream)
+{
+ bool instanceMode = (!ranges);
+
+ int maxSubtrisSlack = 4096; // x 81B = 324KB
+ int maxBinSegsSlack = 256; // x 2137B = 534KB
+ int maxTileSegsSlack = 4096; // x 136B = 544KB
+
+ // Resize atomics as needed.
+ m_crAtomics .grow(m_numImages * sizeof(CRAtomics));
+ m_crAtomicsHost.grow(m_numImages * sizeof(CRAtomics));
+
+ // Size of these buffers doesn't depend on input.
+ m_binFirstSeg .grow(m_numImages * CR_MAXBINS_SQR * CR_BIN_STREAMS_SIZE * sizeof(S32));
+ m_binTotal .grow(m_numImages * CR_MAXBINS_SQR * CR_BIN_STREAMS_SIZE * sizeof(S32));
+ m_activeTiles .grow(m_numImages * CR_MAXTILES_SQR * sizeof(S32));
+ m_tileFirstSeg .grow(m_numImages * CR_MAXTILES_SQR * sizeof(S32));
+
+ // Construct per-image parameters and determine worst-case buffer sizes.
+ m_crImageParamsHost.grow(m_numImages * sizeof(CRImageParams));
+ CRImageParams* imageParams = (CRImageParams*)m_crImageParamsHost.getPtr();
+ for (int i=0; i < m_numImages; i++)
+ {
+ CRImageParams& ip = imageParams[i];
+
+ int roundSize = CR_BIN_WARPS * 32;
+ int minBatches = CR_BIN_STREAMS_SIZE * 2;
+ int maxRounds = 32;
+
+ ip.triOffset = instanceMode ? 0 : ranges[i].x;
+ ip.triCount = instanceMode ? m_numTriangles : ranges[i].y;
+ ip.binBatchSize = min(max(ip.triCount / (roundSize * minBatches), 1), maxRounds) * roundSize;
+
+ m_maxSubtris = max(m_maxSubtris, min(ip.triCount + maxSubtrisSlack, CR_MAXSUBTRIS_SIZE));
+ m_maxBinSegs = max(m_maxBinSegs, max(m_numBins * CR_BIN_STREAMS_SIZE, (ip.triCount - 1) / CR_BIN_SEG_SIZE + 1) + maxBinSegsSlack);
+ m_maxTileSegs = max(m_maxTileSegs, max(m_numTiles, (ip.triCount - 1) / CR_TILE_SEG_SIZE + 1) + maxTileSegsSlack);
+ }
+
+ // Retry until successful.
+
+ for (;;)
+ {
+ // Allocate buffers.
+ m_triSubtris.reset(m_numImages * m_maxSubtris * sizeof(U8));
+ m_triHeader .reset(m_numImages * m_maxSubtris * sizeof(CRTriangleHeader));
+ m_triData .reset(m_numImages * m_maxSubtris * sizeof(CRTriangleData));
+
+ m_binSegData .reset(m_numImages * m_maxBinSegs * CR_BIN_SEG_SIZE * sizeof(S32));
+ m_binSegNext .reset(m_numImages * m_maxBinSegs * sizeof(S32));
+ m_binSegCount.reset(m_numImages * m_maxBinSegs * sizeof(S32));
+
+ m_tileSegData .reset(m_numImages * m_maxTileSegs * CR_TILE_SEG_SIZE * sizeof(S32));
+ m_tileSegNext .reset(m_numImages * m_maxTileSegs * sizeof(S32));
+ m_tileSegCount.reset(m_numImages * m_maxTileSegs * sizeof(S32));
+
+ // Report if buffers grow from last time.
+ size_t sizesTotal = getTotalBufferSizes();
+ if (sizesTotal > m_bufferSizesReported)
+ {
+ size_t sizesMB = ((sizesTotal - 1) >> 20) + 1; // Round up.
+ sizesMB = ((sizesMB + 9) / 10) * 10; // 10MB granularity enough in this day and age.
+ LOG(INFO) << "Internal buffers grown to " << sizesMB << " MB";
+ m_bufferSizesReported = sizesMB << 20;
+ }
+
+ // Launch stages. Blocks until everything is done.
+ launchStages(instanceMode, peel, stream);
+
+ // Peeling iteration cannot fail, so no point checking things further.
+ if (peel)
+ break;
+
+ // Atomics after coarse stage are now available.
+ CRAtomics* atomics = (CRAtomics*)m_crAtomicsHost.getPtr();
+
+ // Success?
+ bool failed = false;
+ for (int i=0; i < m_numImages; i++)
+ {
+ const CRAtomics& a = atomics[i];
+ failed = failed || (a.numSubtris > m_maxSubtris) || (a.numBinSegs > m_maxBinSegs) || (a.numTileSegs > m_maxTileSegs);
+ }
+ if (!failed)
+ break; // Success!
+
+ // If we were already at maximum capacity, no can do.
+ if (m_maxSubtris == CR_MAXSUBTRIS_SIZE)
+ return false;
+
+ // Enlarge buffers and try again.
+ for (int i=0; i < m_numImages; i++)
+ {
+ const CRAtomics& a = atomics[i];
+ m_maxSubtris = max(m_maxSubtris, min(a.numSubtris + maxSubtrisSlack, CR_MAXSUBTRIS_SIZE));
+ m_maxBinSegs = max(m_maxBinSegs, a.numBinSegs + maxBinSegsSlack);
+ m_maxTileSegs = max(m_maxTileSegs, a.numTileSegs + maxTileSegsSlack);
+ }
+ }
+
+ m_deferredClear = false;
+ return true; // Success.
+}
+
+//------------------------------------------------------------------------
+
+size_t RasterImpl::getTotalBufferSizes(void) const
+{
+ return
+ m_colorBuffer.getSize() + m_depthBuffer.getSize() + // Don't include atomics and image params.
+ m_triSubtris.getSize() + m_triHeader.getSize() + m_triData.getSize() +
+ m_binFirstSeg.getSize() + m_binTotal.getSize() + m_binSegData.getSize() + m_binSegNext.getSize() + m_binSegCount.getSize() +
+ m_activeTiles.getSize() + m_tileFirstSeg.getSize() + m_tileSegData.getSize() + m_tileSegNext.getSize() + m_tileSegCount.getSize();
+}
+
+//------------------------------------------------------------------------
+
+void RasterImpl::launchStages(bool instanceMode, bool peel, cudaStream_t stream)
+{
+ CRImageParams* imageParams = (CRImageParams*)m_crImageParamsHost.getPtr();
+
+ // Unless peeling, initialize atomics to mostly zero.
+ CRAtomics* atomics = (CRAtomics*)m_crAtomicsHost.getPtr();
+ if (!peel)
+ {
+ memset(atomics, 0, m_numImages * sizeof(CRAtomics));
+ for (int i=0; i < m_numImages; i++)
+ atomics[i].numSubtris = imageParams[i].triCount;
+ }
+
+ // Copy to device. If peeling, this is the state after coarse raster launch on first iteration.
+ NVDR_CHECK_CUDA_ERROR(cudaMemcpyAsync(m_crAtomics.getPtr(), atomics, m_numImages * sizeof(CRAtomics), cudaMemcpyHostToDevice, stream));
+
+ // Copy per-image parameters if there are more than fits in launch parameter block and we haven't done it already.
+ if (!peel && m_numImages > CR_EMBED_IMAGE_PARAMS)
+ {
+ int numImageParamsExtra = m_numImages - CR_EMBED_IMAGE_PARAMS;
+ m_crImageParamsExtra.grow(numImageParamsExtra * sizeof(CRImageParams));
+ NVDR_CHECK_CUDA_ERROR(cudaMemcpyAsync(m_crImageParamsExtra.getPtr(), imageParams + CR_EMBED_IMAGE_PARAMS, numImageParamsExtra * sizeof(CRImageParams), cudaMemcpyHostToDevice, stream));
+ }
+
+ // Set global parameters.
+ CRParams p;
+ {
+ p.atomics = (CRAtomics*)m_crAtomics.getPtr();
+ p.numImages = m_numImages;
+ p.totalCount = 0; // Only relevant in range mode.
+ p.instanceMode = instanceMode ? 1 : 0;
+
+ p.numVertices = m_numVertices;
+ p.numTriangles = m_numTriangles;
+ p.vertexBuffer = m_vertexPtr;
+ p.indexBuffer = m_indexPtr;
+
+ p.widthPixels = m_sizePixels.x;
+ p.heightPixels = m_sizePixels.y;
+ p.widthPixelsVp = m_sizeVp.x;
+ p.heightPixelsVp = m_sizeVp.y;
+ p.widthBins = m_sizeBins.x;
+ p.heightBins = m_sizeBins.y;
+ p.numBins = m_numBins;
+
+ p.xs = (float)m_bufferSizeVp.x / (float)m_sizeVp.x;
+ p.ys = (float)m_bufferSizeVp.y / (float)m_sizeVp.y;
+ p.xo = (float)(m_bufferSizeVp.x - m_sizeVp.x - 2 * m_offsetPixels.x) / (float)m_sizeVp.x;
+ p.yo = (float)(m_bufferSizeVp.y - m_sizeVp.y - 2 * m_offsetPixels.y) / (float)m_sizeVp.y;
+
+ p.widthTiles = m_sizeTiles.x;
+ p.heightTiles = m_sizeTiles.y;
+ p.numTiles = m_numTiles;
+
+ p.renderModeFlags = m_renderModeFlags;
+ p.deferredClear = m_deferredClear ? 1 : 0;
+ p.clearColor = m_clearColor;
+ p.clearDepth = CR_DEPTH_MAX;
+
+ p.maxSubtris = m_maxSubtris;
+ p.maxBinSegs = m_maxBinSegs;
+ p.maxTileSegs = m_maxTileSegs;
+
+ p.triSubtris = m_triSubtris.getPtr();
+ p.triHeader = m_triHeader.getPtr();
+ p.triData = m_triData.getPtr();
+ p.binSegData = m_binSegData.getPtr();
+ p.binSegNext = m_binSegNext.getPtr();
+ p.binSegCount = m_binSegCount.getPtr();
+ p.binFirstSeg = m_binFirstSeg.getPtr();
+ p.binTotal = m_binTotal.getPtr();
+ p.tileSegData = m_tileSegData.getPtr();
+ p.tileSegNext = m_tileSegNext.getPtr();
+ p.tileSegCount = m_tileSegCount.getPtr();
+ p.activeTiles = m_activeTiles.getPtr();
+ p.tileFirstSeg = m_tileFirstSeg.getPtr();
+
+ size_t byteOffset = ((size_t)m_offsetPixels.x + (size_t)m_offsetPixels.y * (size_t)p.strideX) * sizeof(U32);
+ p.colorBuffer = m_colorBuffer.getPtr(byteOffset);
+ p.depthBuffer = m_depthBuffer.getPtr(byteOffset);
+ p.peelBuffer = (m_renderModeFlags & CudaRaster::RenderModeFlag_EnableDepthPeeling) ? m_peelBuffer.getPtr(byteOffset) : 0;
+ p.strideX = m_bufferSizePixels.x;
+ p.strideY = m_bufferSizePixels.y;
+
+ memcpy(&p.imageParamsFirst, imageParams, min(m_numImages, CR_EMBED_IMAGE_PARAMS) * sizeof(CRImageParams));
+ p.imageParamsExtra = (CRImageParams*)m_crImageParamsExtra.getPtr();
+ }
+
+ // Setup block sizes.
+
+ dim3 brBlock(32, CR_BIN_WARPS);
+ dim3 crBlock(32, CR_COARSE_WARPS);
+ dim3 frBlock(32, m_numFineWarpsPerBlock);
+ void* args[] = {&p};
+
+ // Launch stages from setup to coarse and copy atomics to host only if this is not a single-tile peeling iteration.
+ if (!peel)
+ {
+ if (instanceMode)
+ {
+ int setupBlocks = (m_numTriangles - 1) / (32 * CR_SETUP_WARPS) + 1;
+ NVDR_CHECK_CUDA_ERROR(cudaLaunchKernel((void*)triangleSetupKernel, dim3(setupBlocks, 1, m_numImages), dim3(32, CR_SETUP_WARPS), args, 0, stream));
+ }
+ else
+ {
+ for (int i=0; i < m_numImages; i++)
+ p.totalCount += imageParams[i].triCount;
+ int setupBlocks = (p.totalCount - 1) / (32 * CR_SETUP_WARPS) + 1;
+ NVDR_CHECK_CUDA_ERROR(cudaLaunchKernel((void*)triangleSetupKernel, dim3(setupBlocks, 1, 1), dim3(32, CR_SETUP_WARPS), args, 0, stream));
+ }
+ NVDR_CHECK_CUDA_ERROR(cudaLaunchKernel((void*)binRasterKernel, dim3(CR_BIN_STREAMS_SIZE, 1, m_numImages), brBlock, args, 0, stream));
+ NVDR_CHECK_CUDA_ERROR(cudaLaunchKernel((void*)coarseRasterKernel, dim3(m_numSMs * m_numCoarseBlocksPerSM, 1, m_numImages), crBlock, args, 0, stream));
+ NVDR_CHECK_CUDA_ERROR(cudaMemcpyAsync(m_crAtomicsHost.getPtr(), m_crAtomics.getPtr(), sizeof(CRAtomics) * m_numImages, cudaMemcpyDeviceToHost, stream));
+ }
+
+ // Fine rasterizer is launched always.
+ NVDR_CHECK_CUDA_ERROR(cudaLaunchKernel((void*)fineRasterKernel, dim3(m_numSMs * m_numFineBlocksPerSM, 1, m_numImages), frBlock, args, 0, stream));
+ NVDR_CHECK_CUDA_ERROR(cudaStreamSynchronize(stream));
+}
+
+//------------------------------------------------------------------------
diff --git a/LAM_gpro/external/nvdiffrast/nvdiffrast/common/cudaraster/impl/RasterImpl.cu b/LAM_gpro/external/nvdiffrast/nvdiffrast/common/cudaraster/impl/RasterImpl.cu
new file mode 100644
index 0000000..43b1edf
--- /dev/null
+++ b/LAM_gpro/external/nvdiffrast/nvdiffrast/common/cudaraster/impl/RasterImpl.cu
@@ -0,0 +1,37 @@
+// Copyright (c) 2009-2022, NVIDIA CORPORATION. All rights reserved.
+//
+// NVIDIA CORPORATION and its licensors retain all intellectual property
+// and proprietary rights in and to this software, related documentation
+// and any modifications thereto. Any use, reproduction, disclosure or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA CORPORATION is strictly prohibited.
+
+#include "../CudaRaster.hpp"
+#include "PrivateDefs.hpp"
+#include "Constants.hpp"
+#include "Util.inl"
+
+namespace CR
+{
+
+//------------------------------------------------------------------------
+// Stage implementations.
+//------------------------------------------------------------------------
+
+#include "TriangleSetup.inl"
+#include "BinRaster.inl"
+#include "CoarseRaster.inl"
+#include "FineRaster.inl"
+
+}
+
+//------------------------------------------------------------------------
+// Stage entry points.
+//------------------------------------------------------------------------
+
+__global__ void __launch_bounds__(CR_SETUP_WARPS * 32, CR_SETUP_OPT_BLOCKS) triangleSetupKernel (const CR::CRParams p) { CR::triangleSetupImpl(p); }
+__global__ void __launch_bounds__(CR_BIN_WARPS * 32, 1) binRasterKernel (const CR::CRParams p) { CR::binRasterImpl(p); }
+__global__ void __launch_bounds__(CR_COARSE_WARPS * 32, 1) coarseRasterKernel (const CR::CRParams p) { CR::coarseRasterImpl(p); }
+__global__ void __launch_bounds__(CR_FINE_MAX_WARPS * 32, 1) fineRasterKernel (const CR::CRParams p) { CR::fineRasterImpl(p); }
+
+//------------------------------------------------------------------------
diff --git a/LAM_gpro/external/nvdiffrast/nvdiffrast/common/cudaraster/impl/RasterImpl.hpp b/LAM_gpro/external/nvdiffrast/nvdiffrast/common/cudaraster/impl/RasterImpl.hpp
new file mode 100644
index 0000000..d594acd
--- /dev/null
+++ b/LAM_gpro/external/nvdiffrast/nvdiffrast/common/cudaraster/impl/RasterImpl.hpp
@@ -0,0 +1,102 @@
+// Copyright (c) 2009-2022, NVIDIA CORPORATION. All rights reserved.
+//
+// NVIDIA CORPORATION and its licensors retain all intellectual property
+// and proprietary rights in and to this software, related documentation
+// and any modifications thereto. Any use, reproduction, disclosure or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA CORPORATION is strictly prohibited.
+
+#pragma once
+#include "PrivateDefs.hpp"
+#include "Buffer.hpp"
+#include "../CudaRaster.hpp"
+
+namespace CR
+{
+//------------------------------------------------------------------------
+
+class RasterImpl
+{
+public:
+ RasterImpl (void);
+ ~RasterImpl (void);
+
+ void setBufferSize (Vec3i size);
+ void setViewport (Vec2i size, Vec2i offset);
+ void setRenderModeFlags (U32 flags) { m_renderModeFlags = flags; }
+ void deferredClear (U32 color) { m_deferredClear = true; m_clearColor = color; }
+ void setVertexBuffer (void* ptr, int numVertices) { m_vertexPtr = ptr; m_numVertices = numVertices; } // GPU pointer.
+ void setIndexBuffer (void* ptr, int numTriangles) { m_indexPtr = ptr; m_numTriangles = numTriangles; } // GPU pointer.
+ bool drawTriangles (const Vec2i* ranges, bool peel, cudaStream_t stream);
+ void* getColorBuffer (void) { return m_colorBuffer.getPtr(); } // GPU pointer.
+ void* getDepthBuffer (void) { return m_depthBuffer.getPtr(); } // GPU pointer.
+ void swapDepthAndPeel (void);
+ size_t getTotalBufferSizes (void) const;
+
+private:
+ void launchStages (bool instanceMode, bool peel, cudaStream_t stream);
+
+ // State.
+
+ unsigned int m_renderModeFlags;
+ bool m_deferredClear;
+ unsigned int m_clearColor;
+ void* m_vertexPtr;
+ void* m_indexPtr;
+ int m_numVertices; // Input buffer size.
+ int m_numTriangles; // Input buffer size.
+ size_t m_bufferSizesReported; // Previously reported buffer sizes.
+
+ // Surfaces.
+
+ Buffer m_colorBuffer;
+ Buffer m_depthBuffer;
+ Buffer m_peelBuffer;
+ int m_numImages;
+ Vec2i m_bufferSizePixels; // Internal buffer size.
+ Vec2i m_bufferSizeVp; // Total viewport size.
+ Vec2i m_sizePixels; // Internal size at which all computation is done, buffers reserved, etc.
+ Vec2i m_sizeVp; // Size to which output will be cropped outside, determines viewport size.
+ Vec2i m_offsetPixels; // Viewport offset for tiled rendering.
+ Vec2i m_sizeBins;
+ S32 m_numBins;
+ Vec2i m_sizeTiles;
+ S32 m_numTiles;
+
+ // Launch sizes etc.
+
+ S32 m_numSMs;
+ S32 m_numCoarseBlocksPerSM;
+ S32 m_numFineBlocksPerSM;
+ S32 m_numFineWarpsPerBlock;
+
+ // Global intermediate buffers. Individual images have offsets to these.
+
+ Buffer m_crAtomics;
+ HostBuffer m_crAtomicsHost;
+ HostBuffer m_crImageParamsHost;
+ Buffer m_crImageParamsExtra;
+ Buffer m_triSubtris;
+ Buffer m_triHeader;
+ Buffer m_triData;
+ Buffer m_binFirstSeg;
+ Buffer m_binTotal;
+ Buffer m_binSegData;
+ Buffer m_binSegNext;
+ Buffer m_binSegCount;
+ Buffer m_activeTiles;
+ Buffer m_tileFirstSeg;
+ Buffer m_tileSegData;
+ Buffer m_tileSegNext;
+ Buffer m_tileSegCount;
+
+ // Actual buffer sizes.
+
+ S32 m_maxSubtris;
+ S32 m_maxBinSegs;
+ S32 m_maxTileSegs;
+};
+
+//------------------------------------------------------------------------
+} // namespace CR
+
diff --git a/LAM_gpro/external/nvdiffrast/nvdiffrast/common/cudaraster/impl/TriangleSetup.inl b/LAM_gpro/external/nvdiffrast/nvdiffrast/common/cudaraster/impl/TriangleSetup.inl
new file mode 100644
index 0000000..276f0a4
--- /dev/null
+++ b/LAM_gpro/external/nvdiffrast/nvdiffrast/common/cudaraster/impl/TriangleSetup.inl
@@ -0,0 +1,402 @@
+// Copyright (c) 2009-2022, NVIDIA CORPORATION. All rights reserved.
+//
+// NVIDIA CORPORATION and its licensors retain all intellectual property
+// and proprietary rights in and to this software, related documentation
+// and any modifications thereto. Any use, reproduction, disclosure or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA CORPORATION is strictly prohibited.
+
+//------------------------------------------------------------------------
+
+__device__ __inline__ void snapTriangle(
+ const CRParams& p,
+ float4 v0, float4 v1, float4 v2,
+ int2& p0, int2& p1, int2& p2, float3& rcpW, int2& lo, int2& hi)
+{
+ F32 viewScaleX = (F32)(p.widthPixelsVp << (CR_SUBPIXEL_LOG2 - 1));
+ F32 viewScaleY = (F32)(p.heightPixelsVp << (CR_SUBPIXEL_LOG2 - 1));
+ rcpW = make_float3(1.0f / v0.w, 1.0f / v1.w, 1.0f / v2.w);
+ p0 = make_int2(f32_to_s32_sat(v0.x * rcpW.x * viewScaleX), f32_to_s32_sat(v0.y * rcpW.x * viewScaleY));
+ p1 = make_int2(f32_to_s32_sat(v1.x * rcpW.y * viewScaleX), f32_to_s32_sat(v1.y * rcpW.y * viewScaleY));
+ p2 = make_int2(f32_to_s32_sat(v2.x * rcpW.z * viewScaleX), f32_to_s32_sat(v2.y * rcpW.z * viewScaleY));
+ lo = make_int2(min_min(p0.x, p1.x, p2.x), min_min(p0.y, p1.y, p2.y));
+ hi = make_int2(max_max(p0.x, p1.x, p2.x), max_max(p0.y, p1.y, p2.y));
+}
+
+//------------------------------------------------------------------------
+
+__device__ __inline__ U32 cover8x8_selectFlips(S32 dx, S32 dy) // 10 instr
+{
+ U32 flips = 0;
+ if (dy > 0 || (dy == 0 && dx <= 0))
+ flips ^= (1 << CR_FLIPBIT_FLIP_X) ^ (1 << CR_FLIPBIT_FLIP_Y) ^ (1 << CR_FLIPBIT_COMPL);
+ if (dx > 0)
+ flips ^= (1 << CR_FLIPBIT_FLIP_X) ^ (1 << CR_FLIPBIT_FLIP_Y);
+ if (::abs(dx) < ::abs(dy))
+ flips ^= (1 << CR_FLIPBIT_SWAP_XY) ^ (1 << CR_FLIPBIT_FLIP_Y);
+ return flips;
+}
+
+//------------------------------------------------------------------------
+
+__device__ __inline__ bool prepareTriangle(
+ const CRParams& p,
+ int2 p0, int2 p1, int2 p2, int2 lo, int2 hi,
+ int2& d1, int2& d2, S32& area)
+{
+ // Backfacing or degenerate => cull.
+
+ d1 = make_int2(p1.x - p0.x, p1.y - p0.y);
+ d2 = make_int2(p2.x - p0.x, p2.y - p0.y);
+ area = d1.x * d2.y - d1.y * d2.x;
+
+ if (area == 0)
+ return false; // Degenerate.
+
+ if (area < 0 && (p.renderModeFlags & CudaRaster::RenderModeFlag_EnableBackfaceCulling) != 0)
+ return false; // Backfacing.
+
+ // AABB falls between samples => cull.
+
+ int sampleSize = 1 << CR_SUBPIXEL_LOG2;
+ int biasX = (p.widthPixelsVp << (CR_SUBPIXEL_LOG2 - 1)) - (sampleSize >> 1);
+ int biasY = (p.heightPixelsVp << (CR_SUBPIXEL_LOG2 - 1)) - (sampleSize >> 1);
+ int lox = (int)add_add(lo.x, sampleSize - 1, biasX) & -sampleSize;
+ int loy = (int)add_add(lo.y, sampleSize - 1, biasY) & -sampleSize;
+ int hix = (hi.x + biasX) & -sampleSize;
+ int hiy = (hi.y + biasY) & -sampleSize;
+
+ if (lox > hix || loy > hiy)
+ return false; // Between pixels.
+
+ // AABB covers 1 or 2 samples => cull if they are not covered.
+
+ int diff = add_sub(hix, hiy, lox) - loy;
+ if (diff <= sampleSize)
+ {
+ int2 t0 = make_int2(add_sub(p0.x, biasX, lox), add_sub(p0.y, biasY, loy));
+ int2 t1 = make_int2(add_sub(p1.x, biasX, lox), add_sub(p1.y, biasY, loy));
+ int2 t2 = make_int2(add_sub(p2.x, biasX, lox), add_sub(p2.y, biasY, loy));
+ S32 e0 = t0.x * t1.y - t0.y * t1.x;
+ S32 e1 = t1.x * t2.y - t1.y * t2.x;
+ S32 e2 = t2.x * t0.y - t2.y * t0.x;
+ if (area < 0)
+ {
+ e0 = -e0;
+ e1 = -e1;
+ e2 = -e2;
+ }
+
+ if (e0 < 0 || e1 < 0 || e2 < 0)
+ {
+ if (diff == 0)
+ return false; // Between pixels.
+
+ t0 = make_int2(add_sub(p0.x, biasX, hix), add_sub(p0.y, biasY, hiy));
+ t1 = make_int2(add_sub(p1.x, biasX, hix), add_sub(p1.y, biasY, hiy));
+ t2 = make_int2(add_sub(p2.x, biasX, hix), add_sub(p2.y, biasY, hiy));
+ e0 = t0.x * t1.y - t0.y * t1.x;
+ e1 = t1.x * t2.y - t1.y * t2.x;
+ e2 = t2.x * t0.y - t2.y * t0.x;
+ if (area < 0)
+ {
+ e0 = -e0;
+ e1 = -e1;
+ e2 = -e2;
+ }
+
+ if (e0 < 0 || e1 < 0 || e2 < 0)
+ return false; // Between pixels.
+ }
+ }
+
+ // Otherwise => proceed to output the triangle.
+
+ return true; // Visible.
+}
+
+//------------------------------------------------------------------------
+
+__device__ __inline__ void setupTriangle(
+ const CRParams& p,
+ CRTriangleHeader* th, CRTriangleData* td, int triId,
+ float v0z, float v1z, float v2z,
+ int2 p0, int2 p1, int2 p2, float3 rcpW,
+ int2 d1, int2 d2, S32 area)
+{
+ // Swap vertices 1 and 2 if area is negative. Only executed if backface culling is
+ // disabled (if it is enabled, we never come here with area < 0).
+
+ if (area < 0)
+ {
+ swap(d1, d2);
+ swap(p1, p2);
+ swap(v1z, v2z);
+ swap(rcpW.y, rcpW.z);
+ area = -area;
+ }
+
+ int2 wv0;
+ wv0.x = p0.x + (p.widthPixelsVp << (CR_SUBPIXEL_LOG2 - 1));
+ wv0.y = p0.y + (p.heightPixelsVp << (CR_SUBPIXEL_LOG2 - 1));
+
+ // Setup depth plane equation.
+
+ F32 zcoef = (F32)(CR_DEPTH_MAX - CR_DEPTH_MIN) * 0.5f;
+ F32 zbias = (F32)(CR_DEPTH_MAX + CR_DEPTH_MIN) * 0.5f;
+ float3 zvert = make_float3(
+ (v0z * zcoef) * rcpW.x + zbias,
+ (v1z * zcoef) * rcpW.y + zbias,
+ (v2z * zcoef) * rcpW.z + zbias
+ );
+ int2 zv0 = make_int2(
+ wv0.x - (1 << (CR_SUBPIXEL_LOG2 - 1)),
+ wv0.y - (1 << (CR_SUBPIXEL_LOG2 - 1))
+ );
+ uint3 zpleq = setupPleq(zvert, zv0, d1, d2, 1.0f / (F32)area);
+
+ U32 zmin = f32_to_u32_sat(fminf(fminf(zvert.x, zvert.y), zvert.z) - (F32)CR_LERP_ERROR(0));
+
+ // Write CRTriangleData.
+
+ *(uint4*)td = make_uint4(zpleq.x, zpleq.y, zpleq.z, triId);
+
+ // Determine flipbits.
+
+ U32 f01 = cover8x8_selectFlips(d1.x, d1.y);
+ U32 f12 = cover8x8_selectFlips(d2.x - d1.x, d2.y - d1.y);
+ U32 f20 = cover8x8_selectFlips(-d2.x, -d2.y);
+
+ // Write CRTriangleHeader.
+
+ *(uint4*)th = make_uint4(
+ prmt(p0.x, p0.y, 0x5410),
+ prmt(p1.x, p1.y, 0x5410),
+ prmt(p2.x, p2.y, 0x5410),
+ (zmin & 0xfffff000u) | (f01 << 6) | (f12 << 2) | (f20 >> 2));
+}
+
+//------------------------------------------------------------------------
+
+// Triangle setup: one thread per input triangle. Fetches indices and
+// positions, applies the viewport transform, culls triangles fully outside
+// the view frustum, and snaps vertices to fixed-point subpixel coordinates.
+// Triangles that satisfy the fast-path limits are emitted directly;
+// otherwise the triangle is clipped against the frustum and the resulting
+// fan is emitted as subtriangles. triSubtris[taskIdx] receives the number
+// of emitted subtriangles (0 if culled or invalid).
+__device__ __inline__ void triangleSetupImpl(const CRParams p)
+{
+ // 18 floats of per-thread scratch: clipTriangleWithFrustum writes at most
+ // 9 clipped vertices, 2 barycentric coordinates each.
+ __shared__ F32 s_bary[CR_SETUP_WARPS * 32][18];
+ F32* bary = s_bary[threadIdx.x + threadIdx.y * 32];
+
+ // Compute task and image indices.
+
+ int taskIdx = threadIdx.x + 32 * (threadIdx.y + CR_SETUP_WARPS * blockIdx.x);
+ int imageIdx = 0;
+ if (p.instanceMode)
+ {
+ imageIdx = blockIdx.z;
+ if (taskIdx >= p.numTriangles)
+ return;
+ }
+ else
+ {
+ // Walk the per-image triangle counts to find which image this flat
+ // task index falls into; taskIdx becomes image-relative.
+ while (imageIdx < p.numImages)
+ {
+ int count = getImageParams(p, imageIdx).triCount;
+ if (taskIdx < count)
+ break;
+ taskIdx -= count;
+ imageIdx += 1;
+ }
+ if (imageIdx == p.numImages)
+ return;
+ }
+
+ // Per-image data structures.
+
+ const CRImageParams& ip = getImageParams(p, imageIdx);
+ CRAtomics& atomics = p.atomics[imageIdx];
+
+ const int* indexBuffer = (const int*)p.indexBuffer;
+ U8* triSubtris = (U8*)p.triSubtris + imageIdx * p.maxSubtris;
+ CRTriangleHeader* triHeader = (CRTriangleHeader*)p.triHeader + imageIdx * p.maxSubtris;
+ CRTriangleData* triData = (CRTriangleData*)p.triData + imageIdx * p.maxSubtris;
+
+ // Determine triangle index.
+
+ int triIdx = taskIdx;
+ if (!p.instanceMode)
+ triIdx += ip.triOffset;
+
+ // Read vertex indices.
+
+ if ((U32)triIdx >= (U32)p.numTriangles)
+ {
+ // Bad triangle index.
+ triSubtris[taskIdx] = 0;
+ return;
+ }
+
+ uint4 vidx;
+ vidx.x = indexBuffer[triIdx * 3 + 0];
+ vidx.y = indexBuffer[triIdx * 3 + 1];
+ vidx.z = indexBuffer[triIdx * 3 + 2];
+ vidx.w = triIdx + 1; // Triangle index.
+
+ if (vidx.x >= (U32)p.numVertices ||
+ vidx.y >= (U32)p.numVertices ||
+ vidx.z >= (U32)p.numVertices)
+ {
+ // Bad vertex index.
+ triSubtris[taskIdx] = 0;
+ return;
+ }
+
+ // Read vertex positions.
+
+ const float4* vertexBuffer = (const float4*)p.vertexBuffer;
+ if (p.instanceMode)
+ vertexBuffer += p.numVertices * imageIdx; // Instance offset.
+
+ float4 v0 = vertexBuffer[vidx.x];
+ float4 v1 = vertexBuffer[vidx.y];
+ float4 v2 = vertexBuffer[vidx.z];
+
+ // Adjust vertex positions according to current viewport size and offset.
+
+ v0.x = v0.x * p.xs + v0.w * p.xo;
+ v0.y = v0.y * p.ys + v0.w * p.yo;
+ v1.x = v1.x * p.xs + v1.w * p.xo;
+ v1.y = v1.y * p.ys + v1.w * p.yo;
+ v2.x = v2.x * p.xs + v2.w * p.xo;
+ v2.y = v2.y * p.ys + v2.w * p.yo;
+
+ // Outside view frustum => cull.
+ // Note: the bitwise |/& on comparison results are intentional; they keep
+ // these tests branch-free.
+
+ if (v0.w < fabsf(v0.x) | v0.w < fabsf(v0.y) | v0.w < fabsf(v0.z))
+ {
+ // All three vertices beyond the same frustum plane => fully outside.
+ if ((v0.w < +v0.x & v1.w < +v1.x & v2.w < +v2.x) |
+ (v0.w < -v0.x & v1.w < -v1.x & v2.w < -v2.x) |
+ (v0.w < +v0.y & v1.w < +v1.y & v2.w < +v2.y) |
+ (v0.w < -v0.y & v1.w < -v1.y & v2.w < -v2.y) |
+ (v0.w < +v0.z & v1.w < +v1.z & v2.w < +v2.z) |
+ (v0.w < -v0.z & v1.w < -v1.z & v2.w < -v2.z))
+ {
+ triSubtris[taskIdx] = 0;
+ return;
+ }
+ }
+
+ // Inside depth range => try to snap vertices.
+
+ if (v0.w >= fabsf(v0.z) & v1.w >= fabsf(v1.z) & v2.w >= fabsf(v2.z))
+ {
+ // Inside S16 range and small enough => fast path.
+ // Note: aabbLimit comes from the fact that cover8x8
+ // does not support guardband with maximal viewport.
+
+ int2 p0, p1, p2, lo, hi;
+ float3 rcpW;
+
+ snapTriangle(p, v0, v1, v2, p0, p1, p2, rcpW, lo, hi);
+ S32 loxy = ::min(lo.x, lo.y);
+ S32 hixy = ::max(hi.x, hi.y);
+ S32 aabbLimit = (1 << (CR_MAXVIEWPORT_LOG2 + CR_SUBPIXEL_LOG2)) - 1;
+
+ if (loxy >= -32768 && hixy <= 32767 && hixy - loxy <= aabbLimit)
+ {
+ int2 d1, d2;
+ S32 area;
+ bool res = prepareTriangle(p, p0, p1, p2, lo, hi, d1, d2, area);
+ triSubtris[taskIdx] = res ? 1 : 0;
+
+ if (res)
+ setupTriangle(
+ p,
+ &triHeader[taskIdx], &triData[taskIdx], vidx.w,
+ v0.z, v1.z, v2.z,
+ p0, p1, p2, rcpW,
+ d1, d2, area);
+
+ return;
+ }
+ }
+
+ // Clip to view frustum.
+ // Clipped vertices are represented as barycentrics over (ov0, od1, od2):
+ // v = ov0 + od1 * b0 + od2 * b1.
+
+ float4 ov0 = v0;
+ float4 od1 = make_float4(v1.x - v0.x, v1.y - v0.y, v1.z - v0.z, v1.w - v0.w);
+ float4 od2 = make_float4(v2.x - v0.x, v2.y - v0.y, v2.z - v0.z, v2.w - v0.w);
+ int numVerts = clipTriangleWithFrustum(bary, &ov0.x, &v1.x, &v2.x, &od1.x, &od2.x);
+
+ // Count non-culled subtriangles.
+ // The clipped polygon is fan-triangulated as (v0, v[i-1], v[i]).
+
+ v0.x = ov0.x + od1.x * bary[0] + od2.x * bary[1];
+ v0.y = ov0.y + od1.y * bary[0] + od2.y * bary[1];
+ v0.z = ov0.z + od1.z * bary[0] + od2.z * bary[1];
+ v0.w = ov0.w + od1.w * bary[0] + od2.w * bary[1];
+ v1.x = ov0.x + od1.x * bary[2] + od2.x * bary[3];
+ v1.y = ov0.y + od1.y * bary[2] + od2.y * bary[3];
+ v1.z = ov0.z + od1.z * bary[2] + od2.z * bary[3];
+ v1.w = ov0.w + od1.w * bary[2] + od2.w * bary[3];
+ float4 tv1 = v1; // Saved so the second fan pass can restart from vertex 1.
+
+ int numSubtris = 0;
+ for (int i = 2; i < numVerts; i++)
+ {
+ v2.x = ov0.x + od1.x * bary[i * 2 + 0] + od2.x * bary[i * 2 + 1];
+ v2.y = ov0.y + od1.y * bary[i * 2 + 0] + od2.y * bary[i * 2 + 1];
+ v2.z = ov0.z + od1.z * bary[i * 2 + 0] + od2.z * bary[i * 2 + 1];
+ v2.w = ov0.w + od1.w * bary[i * 2 + 0] + od2.w * bary[i * 2 + 1];
+
+ int2 p0, p1, p2, lo, hi, d1, d2;
+ float3 rcpW;
+ S32 area;
+
+ snapTriangle(p, v0, v1, v2, p0, p1, p2, rcpW, lo, hi);
+ if (prepareTriangle(p, p0, p1, p2, lo, hi, d1, d2, area))
+ numSubtris++;
+
+ v1 = v2;
+ }
+
+ triSubtris[taskIdx] = numSubtris;
+
+ // Multiple subtriangles => allocate.
+
+ int subtriBase = taskIdx;
+ if (numSubtris > 1)
+ {
+ subtriBase = atomicAdd(&atomics.numSubtris, numSubtris);
+ triHeader[taskIdx].misc = subtriBase;
+ // Out of subtriangle space => zero numVerts so the setup loop below
+ // emits nothing.
+ if (subtriBase + numSubtris > p.maxSubtris)
+ numVerts = 0;
+ }
+
+ // Setup subtriangles. Second pass over the same fan; must produce the
+ // same accept/reject decisions as the counting pass above.
+
+ v1 = tv1;
+ for (int i = 2; i < numVerts; i++)
+ {
+ v2.x = ov0.x + od1.x * bary[i * 2 + 0] + od2.x * bary[i * 2 + 1];
+ v2.y = ov0.y + od1.y * bary[i * 2 + 0] + od2.y * bary[i * 2 + 1];
+ v2.z = ov0.z + od1.z * bary[i * 2 + 0] + od2.z * bary[i * 2 + 1];
+ v2.w = ov0.w + od1.w * bary[i * 2 + 0] + od2.w * bary[i * 2 + 1];
+
+ int2 p0, p1, p2, lo, hi, d1, d2;
+ float3 rcpW;
+ S32 area;
+
+ snapTriangle(p, v0, v1, v2, p0, p1, p2, rcpW, lo, hi);
+ if (prepareTriangle(p, p0, p1, p2, lo, hi, d1, d2, area))
+ {
+ setupTriangle(
+ p,
+ &triHeader[subtriBase], &triData[subtriBase], vidx.w,
+ v0.z, v1.z, v2.z,
+ p0, p1, p2, rcpW,
+ d1, d2, area);
+
+ subtriBase++;
+ }
+
+ v1 = v2;
+ }
+}
+
+//------------------------------------------------------------------------
diff --git a/LAM_gpro/external/nvdiffrast/nvdiffrast/common/cudaraster/impl/Util.inl b/LAM_gpro/external/nvdiffrast/nvdiffrast/common/cudaraster/impl/Util.inl
new file mode 100644
index 0000000..f8faeba
--- /dev/null
+++ b/LAM_gpro/external/nvdiffrast/nvdiffrast/common/cudaraster/impl/Util.inl
@@ -0,0 +1,452 @@
+// Copyright (c) 2009-2022, NVIDIA CORPORATION. All rights reserved.
+//
+// NVIDIA CORPORATION and its licensors retain all intellectual property
+// and proprietary rights in and to this software, related documentation
+// and any modifications thereto. Any use, reproduction, disclosure or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA CORPORATION is strictly prohibited.
+
+#include "PrivateDefs.hpp"
+
+namespace CR
+{
+//------------------------------------------------------------------------
+
+// Generic device-side swap of two values by reference.
+// Fix: restored the "<class T>" template parameter list, which was lost in
+// this dump (angle-bracket stripping); without it the line does not parse.
+template <class T> __device__ __inline__ void swap(T& a, T& b) { T t = a; a = b; b = t; }
+
+// 64-bit pack/unpack via the double-register pair, warp lane masks, and
+// warp-uniformity test.
+__device__ __inline__ U32 getLo (U64 a) { return __double2loint(__longlong_as_double(a)); }
+__device__ __inline__ S32 getLo (S64 a) { return __double2loint(__longlong_as_double(a)); }
+__device__ __inline__ U32 getHi (U64 a) { return __double2hiint(__longlong_as_double(a)); }
+__device__ __inline__ S32 getHi (S64 a) { return __double2hiint(__longlong_as_double(a)); }
+__device__ __inline__ U64 combineLoHi (U32 lo, U32 hi) { return __double_as_longlong(__hiloint2double(hi, lo)); }
+__device__ __inline__ S64 combineLoHi (S32 lo, S32 hi) { return __double_as_longlong(__hiloint2double(hi, lo)); }
+__device__ __inline__ U32 getLaneMaskLt (void) { U32 r; asm("mov.u32 %0, %lanemask_lt;" : "=r"(r)); return r; }
+__device__ __inline__ U32 getLaneMaskLe (void) { U32 r; asm("mov.u32 %0, %lanemask_le;" : "=r"(r)); return r; }
+__device__ __inline__ U32 getLaneMaskGt (void) { U32 r; asm("mov.u32 %0, %lanemask_gt;" : "=r"(r)); return r; }
+__device__ __inline__ U32 getLaneMaskGe (void) { U32 r; asm("mov.u32 %0, %lanemask_ge;" : "=r"(r)); return r; }
+__device__ __inline__ int findLeadingOne (U32 v) { U32 r; asm("bfind.u32 %0, %1;" : "=r"(r) : "r"(v)); return r; }
+__device__ __inline__ bool singleLane (void) { return ((::__ballot_sync(~0u, true) & getLaneMaskLt()) == 0); }
+
+// 64-bit add expressed as a 32+32 carry chain, saturating float->int
+// conversions, and PTX "video" (vadd/vsub) 16-bit sub-word arithmetic.
+__device__ __inline__ void add_add_carry (U32& rlo, U32 alo, U32 blo, U32& rhi, U32 ahi, U32 bhi) { U64 r = combineLoHi(alo, ahi) + combineLoHi(blo, bhi); rlo = getLo(r); rhi = getHi(r); }
+__device__ __inline__ S32 f32_to_s32_sat (F32 a) { S32 v; asm("cvt.rni.sat.s32.f32 %0, %1;" : "=r"(v) : "f"(a)); return v; }
+__device__ __inline__ U32 f32_to_u32_sat (F32 a) { U32 v; asm("cvt.rni.sat.u32.f32 %0, %1;" : "=r"(v) : "f"(a)); return v; }
+__device__ __inline__ U32 f32_to_u32_sat_rmi (F32 a) { U32 v; asm("cvt.rmi.sat.u32.f32 %0, %1;" : "=r"(v) : "f"(a)); return v; }
+__device__ __inline__ U32 f32_to_u8_sat (F32 a) { U32 v; asm("cvt.rni.sat.u8.f32 %0, %1;" : "=r"(v) : "f"(a)); return v; }
+__device__ __inline__ S64 f32_to_s64 (F32 a) { S64 v; asm("cvt.rni.s64.f32 %0, %1;" : "=l"(v) : "f"(a)); return v; }
+__device__ __inline__ S32 add_s16lo_s16lo (S32 a, S32 b) { S32 v; asm("vadd.s32.s32.s32 %0, %1.h0, %2.h0;" : "=r"(v) : "r"(a), "r"(b)); return v; }
+__device__ __inline__ S32 add_s16hi_s16lo (S32 a, S32 b) { S32 v; asm("vadd.s32.s32.s32 %0, %1.h1, %2.h0;" : "=r"(v) : "r"(a), "r"(b)); return v; }
+__device__ __inline__ S32 add_s16lo_s16hi (S32 a, S32 b) { S32 v; asm("vadd.s32.s32.s32 %0, %1.h0, %2.h1;" : "=r"(v) : "r"(a), "r"(b)); return v; }
+__device__ __inline__ S32 add_s16hi_s16hi (S32 a, S32 b) { S32 v; asm("vadd.s32.s32.s32 %0, %1.h1, %2.h1;" : "=r"(v) : "r"(a), "r"(b)); return v; }
+__device__ __inline__ S32 sub_s16lo_s16lo (S32 a, S32 b) { S32 v; asm("vsub.s32.s32.s32 %0, %1.h0, %2.h0;" : "=r"(v) : "r"(a), "r"(b)); return v; }
+__device__ __inline__ S32 sub_s16hi_s16lo (S32 a, S32 b) { S32 v; asm("vsub.s32.s32.s32 %0, %1.h1, %2.h0;" : "=r"(v) : "r"(a), "r"(b)); return v; }
+__device__ __inline__ S32 sub_s16lo_s16hi (S32 a, S32 b) { S32 v; asm("vsub.s32.s32.s32 %0, %1.h0, %2.h1;" : "=r"(v) : "r"(a), "r"(b)); return v; }
+__device__ __inline__ S32 sub_s16hi_s16hi (S32 a, S32 b) { S32 v; asm("vsub.s32.s32.s32 %0, %1.h1, %2.h1;" : "=r"(v) : "r"(a), "r"(b)); return v; }
+__device__ __inline__ S32 sub_u16lo_u16lo (U32 a, U32 b) { S32 v; asm("vsub.s32.u32.u32 %0, %1.h0, %2.h0;" : "=r"(v) : "r"(a), "r"(b)); return v; }
+__device__ __inline__ S32 sub_u16hi_u16lo (U32 a, U32 b) { S32 v; asm("vsub.s32.u32.u32 %0, %1.h1, %2.h0;" : "=r"(v) : "r"(a), "r"(b)); return v; }
+__device__ __inline__ S32 sub_u16lo_u16hi (U32 a, U32 b) { S32 v; asm("vsub.s32.u32.u32 %0, %1.h0, %2.h1;" : "=r"(v) : "r"(a), "r"(b)); return v; }
+__device__ __inline__ S32 sub_u16hi_u16hi (U32 a, U32 b) { S32 v; asm("vsub.s32.u32.u32 %0, %1.h1, %2.h1;" : "=r"(v) : "r"(a), "r"(b)); return v; }
+// Byte-lane (.b0-.b3) adds and multiply-adds.
+__device__ __inline__ U32 add_b0 (U32 a, U32 b) { U32 v; asm("vadd.u32.u32.u32 %0, %1.b0, %2;" : "=r"(v) : "r"(a), "r"(b)); return v; }
+__device__ __inline__ U32 add_b1 (U32 a, U32 b) { U32 v; asm("vadd.u32.u32.u32 %0, %1.b1, %2;" : "=r"(v) : "r"(a), "r"(b)); return v; }
+__device__ __inline__ U32 add_b2 (U32 a, U32 b) { U32 v; asm("vadd.u32.u32.u32 %0, %1.b2, %2;" : "=r"(v) : "r"(a), "r"(b)); return v; }
+__device__ __inline__ U32 add_b3 (U32 a, U32 b) { U32 v; asm("vadd.u32.u32.u32 %0, %1.b3, %2;" : "=r"(v) : "r"(a), "r"(b)); return v; }
+__device__ __inline__ U32 vmad_b0 (U32 a, U32 b, U32 c) { U32 v; asm("vmad.u32.u32.u32 %0, %1.b0, %2, %3;" : "=r"(v) : "r"(a), "r"(b), "r"(c)); return v; }
+__device__ __inline__ U32 vmad_b1 (U32 a, U32 b, U32 c) { U32 v; asm("vmad.u32.u32.u32 %0, %1.b1, %2, %3;" : "=r"(v) : "r"(a), "r"(b), "r"(c)); return v; }
+__device__ __inline__ U32 vmad_b2 (U32 a, U32 b, U32 c) { U32 v; asm("vmad.u32.u32.u32 %0, %1.b2, %2, %3;" : "=r"(v) : "r"(a), "r"(b), "r"(c)); return v; }
+__device__ __inline__ U32 vmad_b3 (U32 a, U32 b, U32 c) { U32 v; asm("vmad.u32.u32.u32 %0, %1.b3, %2, %3;" : "=r"(v) : "r"(a), "r"(b), "r"(c)); return v; }
+__device__ __inline__ U32 vmad_b0_b3 (U32 a, U32 b, U32 c) { U32 v; asm("vmad.u32.u32.u32 %0, %1.b0, %2.b3, %3;" : "=r"(v) : "r"(a), "r"(b), "r"(c)); return v; }
+__device__ __inline__ U32 vmad_b1_b3 (U32 a, U32 b, U32 c) { U32 v; asm("vmad.u32.u32.u32 %0, %1.b1, %2.b3, %3;" : "=r"(v) : "r"(a), "r"(b), "r"(c)); return v; }
+__device__ __inline__ U32 vmad_b2_b3 (U32 a, U32 b, U32 c) { U32 v; asm("vmad.u32.u32.u32 %0, %1.b2, %2.b3, %3;" : "=r"(v) : "r"(a), "r"(b), "r"(c)); return v; }
+__device__ __inline__ U32 vmad_b3_b3 (U32 a, U32 b, U32 c) { U32 v; asm("vmad.u32.u32.u32 %0, %1.b3, %2.b3, %3;" : "=r"(v) : "r"(a), "r"(b), "r"(c)); return v; }
+__device__ __inline__ U32 add_mask8 (U32 a, U32 b) { U32 v; U32 z=0; asm("vadd.u32.u32.u32 %0.b0, %1, %2, %3;" : "=r"(v) : "r"(a), "r"(b), "r"(z)); return v; }
+__device__ __inline__ U32 sub_mask8 (U32 a, U32 b) { U32 v; U32 z=0; asm("vsub.u32.u32.u32 %0.b0, %1, %2, %3;" : "=r"(v) : "r"(a), "r"(b), "r"(z)); return v; }
+// Fused min/max/add combos, saturating adds, byte permute, select, and
+// misc scalar conversions.
+__device__ __inline__ S32 max_max (S32 a, S32 b, S32 c) { S32 v; asm("vmax.s32.s32.s32.max %0, %1, %2, %3;" : "=r"(v) : "r"(a), "r"(b), "r"(c)); return v; }
+__device__ __inline__ S32 min_min (S32 a, S32 b, S32 c) { S32 v; asm("vmin.s32.s32.s32.min %0, %1, %2, %3;" : "=r"(v) : "r"(a), "r"(b), "r"(c)); return v; }
+__device__ __inline__ S32 max_add (S32 a, S32 b, S32 c) { S32 v; asm("vmax.s32.s32.s32.add %0, %1, %2, %3;" : "=r"(v) : "r"(a), "r"(b), "r"(c)); return v; }
+__device__ __inline__ S32 min_add (S32 a, S32 b, S32 c) { S32 v; asm("vmin.s32.s32.s32.add %0, %1, %2, %3;" : "=r"(v) : "r"(a), "r"(b), "r"(c)); return v; }
+__device__ __inline__ U32 add_add (U32 a, U32 b, U32 c) { U32 v; asm("vadd.u32.u32.u32.add %0, %1, %2, %3;" : "=r"(v) : "r"(a), "r"(b), "r"(c)); return v; }
+__device__ __inline__ U32 sub_add (U32 a, U32 b, U32 c) { U32 v; asm("vsub.u32.u32.u32.add %0, %1, %2, %3;" : "=r"(v) : "r"(a), "r"(b), "r"(c)); return v; }
+__device__ __inline__ U32 add_sub (U32 a, U32 b, U32 c) { U32 v; asm("vsub.u32.u32.u32.add %0, %1, %2, %3;" : "=r"(v) : "r"(a), "r"(c), "r"(b)); return v; }
+__device__ __inline__ S32 add_clamp_0_x (S32 a, S32 b, S32 c) { S32 v; asm("vadd.u32.s32.s32.sat.min %0, %1, %2, %3;" : "=r"(v) : "r"(a), "r"(b), "r"(c)); return v; }
+__device__ __inline__ S32 add_clamp_b0 (S32 a, S32 b, S32 c) { S32 v; asm("vadd.u32.s32.s32.sat %0.b0, %1, %2, %3;" : "=r"(v) : "r"(a), "r"(b), "r"(c)); return v; }
+__device__ __inline__ S32 add_clamp_b2 (S32 a, S32 b, S32 c) { S32 v; asm("vadd.u32.s32.s32.sat %0.b2, %1, %2, %3;" : "=r"(v) : "r"(a), "r"(b), "r"(c)); return v; }
+__device__ __inline__ U32 prmt (U32 a, U32 b, U32 c) { U32 v; asm("prmt.b32 %0, %1, %2, %3;" : "=r"(v) : "r"(a), "r"(b), "r"(c)); return v; }
+__device__ __inline__ S32 u32lo_sext (U32 a) { U32 v; asm("cvt.s16.u32 %0, %1;" : "=r"(v) : "r"(a)); return v; }
+__device__ __inline__ U32 slct (U32 a, U32 b, S32 c) { U32 v; asm("slct.u32.s32 %0, %1, %2, %3;" : "=r"(v) : "r"(a), "r"(b), "r"(c)); return v; }
+__device__ __inline__ S32 slct (S32 a, S32 b, S32 c) { S32 v; asm("slct.s32.s32 %0, %1, %2, %3;" : "=r"(v) : "r"(a), "r"(b), "r"(c)); return v; }
+__device__ __inline__ F32 slct (F32 a, F32 b, S32 c) { F32 v; asm("slct.f32.s32 %0, %1, %2, %3;" : "=f"(v) : "f"(a), "f"(b), "r"(c)); return v; }
+__device__ __inline__ U32 isetge (S32 a, S32 b) { U32 v; asm("set.ge.u32.s32 %0, %1, %2;" : "=r"(v) : "r"(a), "r"(b)); return v; }
+__device__ __inline__ F64 rcp_approx (F64 a) { F64 v; asm("rcp.approx.ftz.f64 %0, %1;" : "=d"(v) : "d"(a)); return v; }
+__device__ __inline__ F32 fma_rm (F32 a, F32 b, F32 c) { F32 v; asm("fma.rm.f32 %0, %1, %2, %3;" : "=f"(v) : "f"(a), "f"(b), "f"(c)); return v; }
+// Forward declarations; definitions appear later in this file.
+__device__ __inline__ U32 idiv_fast (U32 a, U32 b);
+
+__device__ __inline__ uint3 setupPleq (float3 values, int2 v0, int2 d1, int2 d2, F32 areaRcp);
+
+__device__ __inline__ void cover8x8_setupLUT (volatile U64* lut);
+__device__ __inline__ U64 cover8x8_exact_fast (S32 ox, S32 oy, S32 dx, S32 dy, U32 flips, volatile const U64* lut); // Assumes viewport <= 2^11, subpixels <= 2^4, no guardband.
+__device__ __inline__ U64 cover8x8_lookupMask (S64 yinit, U32 yinc, U32 flips, volatile const U64* lut);
+
+__device__ __inline__ U64 cover8x8_exact_noLUT (S32 ox, S32 oy, S32 dx, S32 dy); // optimized reference implementation, does not require look-up table
+__device__ __inline__ U64 cover8x8_conservative_noLUT (S32 ox, S32 oy, S32 dx, S32 dy);
+
+// Fix: restored the "<class T>" template parameter list lost in this dump.
+template <class T> __device__ __inline__ void sortShared(T* ptr, int numItems); // Assumes that numItems <= threadsInBlock. Must sync before & after the call.
+
+// Fetch the per-image parameter record: the first CR_EMBED_IMAGE_PARAMS
+// entries are embedded directly in CRParams; the remainder live in the
+// overflow array.
+__device__ __inline__ const CRImageParams& getImageParams(const CRParams& p, int idx)
+{
+ if (idx < CR_EMBED_IMAGE_PARAMS)
+ return p.imageParamsFirst[idx];
+ return p.imageParamsExtra[idx - CR_EMBED_IMAGE_PARAMS];
+}
+
+//------------------------------------------------------------------------
+
+// Clips a convex polygon against one half-space (Sutherland-Hodgman step).
+// Vertices are barycentric pairs (b0, b1) in baryIn; a vertex's signed
+// plane distance is v0 + v1*b0 + v2*b1, and points with distance >= 0 are
+// kept. Writes the clipped polygon to baryOut and returns its vertex
+// count. Polygons with fewer than 3 vertices are dropped (returns 0).
+__device__ __inline__ int clipPolygonWithPlane(F32* baryOut, const F32* baryIn, int numIn, F32 v0, F32 v1, F32 v2)
+{
+ int numOut = 0;
+ if (numIn >= 3)
+ {
+ // Walk edges (a, b), starting with a = last vertex.
+ int ai = (numIn - 1) * 2;
+ F32 av = v0 + v1 * baryIn[ai + 0] + v2 * baryIn[ai + 1];
+ for (int bi = 0; bi < numIn * 2; bi += 2)
+ {
+ F32 bv = v0 + v1 * baryIn[bi + 0] + v2 * baryIn[bi + 1];
+ // Edge crosses the plane => emit the intersection point.
+ if (av * bv < 0.0f)
+ {
+ F32 bc = av / (av - bv);
+ F32 ac = 1.0f - bc;
+ baryOut[numOut + 0] = baryIn[ai + 0] * ac + baryIn[bi + 0] * bc;
+ baryOut[numOut + 1] = baryIn[ai + 1] * ac + baryIn[bi + 1] * bc;
+ numOut += 2;
+ }
+ // Vertex on the non-negative side => keep it.
+ if (bv >= 0.0f)
+ {
+ baryOut[numOut + 0] = baryIn[bi + 0];
+ baryOut[numOut + 1] = baryIn[bi + 1];
+ numOut += 2;
+ }
+ ai = bi;
+ av = bv;
+ }
+ }
+ return (numOut >> 1);
+}
+
+//------------------------------------------------------------------------
+
+// Clips triangle (v0, v1, v2) against all six frustum planes (|x|,|y|,|z|
+// <= w). Vertices are homogeneous float4s accessed as raw F32 pointers;
+// d1/d2 are the precomputed edge deltas v1-v0 and v2-v0 so each clipped
+// vertex can be expressed as barycentrics over (v0, d1, d2). The result
+// polygon (up to 9 vertices, 2 barycentrics each) is written to bary; the
+// return value is its vertex count. Each axis is only processed if some
+// vertex is outside that axis' slab.
+__device__ __inline__ int clipTriangleWithFrustum(F32* bary, const F32* v0, const F32* v1, const F32* v2, const F32* d1, const F32* d2)
+{
+ // Start with the unclipped triangle in barycentric form.
+ int num = 3;
+ bary[0] = 0.0f, bary[1] = 0.0f;
+ bary[2] = 1.0f, bary[3] = 0.0f;
+ bary[4] = 0.0f, bary[5] = 1.0f;
+
+ if ((v0[3] < fabsf(v0[0])) | (v1[3] < fabsf(v1[0])) | (v2[3] < fabsf(v2[0])))
+ {
+ F32 temp[18];
+ num = clipPolygonWithPlane(temp, bary, num, v0[3] + v0[0], d1[3] + d1[0], d2[3] + d2[0]);
+ num = clipPolygonWithPlane(bary, temp, num, v0[3] - v0[0], d1[3] - d1[0], d2[3] - d2[0]);
+ }
+ if ((v0[3] < fabsf(v0[1])) | (v1[3] < fabsf(v1[1])) | (v2[3] < fabsf(v2[1])))
+ {
+ F32 temp[18];
+ num = clipPolygonWithPlane(temp, bary, num, v0[3] + v0[1], d1[3] + d1[1], d2[3] + d2[1]);
+ num = clipPolygonWithPlane(bary, temp, num, v0[3] - v0[1], d1[3] - d1[1], d2[3] - d2[1]);
+ }
+ if ((v0[3] < fabsf(v0[2])) | (v1[3] < fabsf(v1[2])) | (v2[3] < fabsf(v2[2])))
+ {
+ F32 temp[18];
+ num = clipPolygonWithPlane(temp, bary, num, v0[3] + v0[2], d1[3] + d1[2], d2[3] + d2[2]);
+ num = clipPolygonWithPlane(bary, temp, num, v0[3] - v0[2], d1[3] - d1[2], d2[3] - d2[2]);
+ }
+ return num;
+}
+
+//------------------------------------------------------------------------
+
+// Fast unsigned division via single-precision float math with
+// round-toward-minus-infinity. NOTE(review): correctness relies on F32's
+// 24-bit mantissa, so presumably a and b are small enough here — confirm
+// the operand ranges at the call sites.
+__device__ __inline__ U32 idiv_fast(U32 a, U32 b)
+{
+ return f32_to_u32_sat_rmi(((F32)a + 0.5f) / (F32)b);
+}
+
+//------------------------------------------------------------------------
+
+// Packs a float4 color into a 32-bit ABGR word, 8 bits per channel, with
+// round-to-nearest and saturation (out-of-range components clamp).
+__device__ __inline__ U32 toABGR(float4 color)
+{
+ // 11 instructions: 4*FFMA, 4*F2I, 3*PRMT
+ // Scale by 255 into the high byte (the (1 << 24) factor) and add 0.5 for
+ // rounding; the prmt pair then gathers byte 3 of each component.
+ U32 x = f32_to_u32_sat_rmi(fma_rm(color.x, (1 << 24) * 255.0f, (1 << 24) * 0.5f));
+ U32 y = f32_to_u32_sat_rmi(fma_rm(color.y, (1 << 24) * 255.0f, (1 << 24) * 0.5f));
+ U32 z = f32_to_u32_sat_rmi(fma_rm(color.z, (1 << 24) * 255.0f, (1 << 24) * 0.5f));
+ U32 w = f32_to_u32_sat_rmi(fma_rm(color.w, (1 << 24) * 255.0f, (1 << 24) * 0.5f));
+ return prmt(prmt(x, y, 0x0073), prmt(z, w, 0x0073), 0x5410);
+}
+
+//------------------------------------------------------------------------
+// v0 = subpixels relative to the bottom-left sampling point
+
+// Builds a fixed-point plane equation (pleq.x * x + pleq.y * y + pleq.z)
+// that interpolates the three per-vertex values across the triangle.
+// v0 is in subpixels relative to the bottom-left sampling point; d1/d2 are
+// the fixed-point edge vectors and areaRcp = 1/area. The math is done in
+// S64 fixed point with an exponent (sh, rcpShift) recovered from the float
+// bit patterns; evaluating near the AABB center (centerX/centerY) keeps
+// the constant term's magnitude small.
+__device__ __inline__ uint3 setupPleq(float3 values, int2 v0, int2 d1, int2 d2, F32 areaRcp)
+{
+ // Normalize the value range: sh scales the values down so products fit.
+ F32 mx = fmaxf(fmaxf(values.x, values.y), values.z);
+ int sh = ::min(::max((__float_as_int(mx) >> 23) - (127 + 22), 0), 8);
+ S32 t0 = (U32)values.x >> sh;
+ S32 t1 = ((U32)values.y >> sh) - t0;
+ S32 t2 = ((U32)values.z >> sh) - t0;
+
+ // Split 1/area into mantissa and shift so the division becomes an
+ // integer multiply plus shift.
+ U32 rcpMant = (__float_as_int(areaRcp) & 0x007FFFFF) | 0x00800000;
+ int rcpShift = (23 + 127) - (__float_as_int(areaRcp) >> 23);
+
+ // Gradients via the cross-product form of the plane equation.
+ uint3 pleq;
+ S64 xc = ((S64)t1 * d2.y - (S64)t2 * d1.y) * rcpMant;
+ S64 yc = ((S64)t2 * d1.x - (S64)t1 * d2.x) * rcpMant;
+ pleq.x = (U32)(xc >> (rcpShift - (sh + CR_SUBPIXEL_LOG2)));
+ pleq.y = (U32)(yc >> (rcpShift - (sh + CR_SUBPIXEL_LOG2)));
+
+ // Pixel nearest the triangle AABB center, and v0's offset from it.
+ S32 centerX = (v0.x * 2 + min_min(d1.x, d2.x, 0) + max_max(d1.x, d2.x, 0)) >> (CR_SUBPIXEL_LOG2 + 1);
+ S32 centerY = (v0.y * 2 + min_min(d1.y, d2.y, 0) + max_max(d1.y, d2.y, 0)) >> (CR_SUBPIXEL_LOG2 + 1);
+ S32 vcx = v0.x - (centerX << CR_SUBPIXEL_LOG2);
+ S32 vcy = v0.y - (centerY << CR_SUBPIXEL_LOG2);
+
+ // Constant term: value at v0, corrected back to the (0,0) origin.
+ pleq.z = t0 << sh;
+ pleq.z -= (U32)(((xc >> 13) * vcx + (yc >> 13) * vcy) >> (rcpShift - (sh + 13)));
+ pleq.z -= pleq.x * centerX + pleq.y * centerY;
+ return pleq;
+}
+
+//------------------------------------------------------------------------
+
+// Cooperatively fills the 8x8 coverage look-up table, striding the entries
+// across all threads of the block. Each entry is a 64-bit mask (one bit
+// per pixel of an 8x8 block, bit index = x + y*8) for one combination of
+// half (left/right 4 columns), integer y-intercept, 3-bit per-column step
+// shape, and the swap-XY / negate-X edge orientation bits encoded in the
+// low bits of lutIdx.
+__device__ __inline__ void cover8x8_setupLUT(volatile U64* lut)
+{
+ for (S32 lutIdx = threadIdx.x + blockDim.x * threadIdx.y; lutIdx < CR_COVER8X8_LUT_SIZE; lutIdx += blockDim.x * blockDim.y)
+ {
+ // Decode the table index.
+ int half = (lutIdx < (12 << 5)) ? 0 : 1;
+ int yint = (lutIdx >> 5) - half * 12 - 3;
+ U32 shape = ((lutIdx >> 2) & 7) << (31 - 2);
+ S32 slctSwapXY = lutIdx << (31 - 1);
+ S32 slctNegX = lutIdx << (31 - 0);
+ S32 slctCompl = slctSwapXY ^ slctNegX;
+
+ // Rasterize 4 columns of the 8x8 block under the decoded edge.
+ U64 mask = 0;
+ int xlo = half * 4;
+ int xhi = xlo + 4;
+ for (int x = xlo; x < xhi; x++)
+ {
+ int ylo = slct(0, ::max(yint, 0), slctCompl);
+ int yhi = slct(::min(yint, 8), 8, slctCompl);
+ for (int y = ylo; y < yhi; y++)
+ {
+ // Map (x, y) through the orientation transform before setting
+ // the pixel bit.
+ int xx = slct(x, y, slctSwapXY);
+ int yy = slct(y, x, slctSwapXY);
+ xx = slct(xx, 7 - xx, slctNegX);
+ mask |= (U64)1 << (xx + yy * 8);
+ }
+ // Advance the intercept by this column's shape bit.
+ yint += shape >> 31;
+ shape <<= 1;
+ }
+ lut[lutIdx] = mask;
+ }
+}
+
+//------------------------------------------------------------------------
+
+// Exact 8x8 coverage mask for one triangle edge, via the LUT.
+// (ox, oy) is the edge origin and (dx, dy) its direction, in subpixels;
+// flips carries the CR_FLIPBIT_* orientation bits that reduce the edge to
+// a canonical octant. See the declaration above for the viewport/subpixel
+// limits this assumes.
+__device__ __inline__ U64 cover8x8_exact_fast(S32 ox, S32 oy, S32 dx, S32 dy, U32 flips, volatile const U64* lut) // 52 instr
+{
+ // Fixed-point scaling constants for the y(x) edge function below.
+ F32 yinitBias = (F32)(1 << (31 - CR_MAXVIEWPORT_LOG2 - CR_SUBPIXEL_LOG2 * 2));
+ F32 yinitScale = (F32)(1 << (32 - CR_SUBPIXEL_LOG2));
+ F32 yincScale = 65536.0f * 65536.0f;
+
+ S32 slctFlipY = flips << (31 - CR_FLIPBIT_FLIP_Y);
+ S32 slctFlipX = flips << (31 - CR_FLIPBIT_FLIP_X);
+ S32 slctSwapXY = flips << (31 - CR_FLIPBIT_SWAP_XY);
+
+ // Evaluate cross product.
+
+ S32 t = ox * dy - oy * dx;
+ F32 det = (F32)slct(t, t - dy * (7 << CR_SUBPIXEL_LOG2), slctFlipX);
+ if (flips >= (1 << CR_FLIPBIT_COMPL))
+ det = -det;
+
+ // Represent Y as a function of X.
+
+ F32 xrcp = 1.0f / (F32)::abs(slct(dx, dy, slctSwapXY));
+ F32 yzero = det * yinitScale * xrcp + yinitBias;
+ S64 yinit = f32_to_s64(slct(yzero, -yzero, slctFlipY));
+ U32 yinc = f32_to_u32_sat((F32)::abs(slct(dy, dx, slctSwapXY)) * xrcp * yincScale);
+
+ // Lookup.
+
+ return cover8x8_lookupMask(yinit, yinc, flips, lut);
+}
+
+//------------------------------------------------------------------------
+
+// Turns the edge function (integer part in the high word of yinit, slope
+// yinc per column) into two LUT indices — one per 4-column half of the
+// 8x8 block — and ORs the looked-up masks. The add_add_carry chain steps
+// the fractional part per column, accumulating carries into the packed
+// shape/intercept code. COMPL edges get the complemented mask.
+__device__ __inline__ U64 cover8x8_lookupMask(S64 yinit, U32 yinc, U32 flips, volatile const U64* lut)
+{
+ // First half.
+
+ U32 yfrac = getLo(yinit);
+ U32 shape = add_clamp_0_x(getHi(yinit) + 4, 0, 11);
+ add_add_carry(yfrac, yfrac, yinc, shape, shape, shape);
+ add_add_carry(yfrac, yfrac, yinc, shape, shape, shape);
+ add_add_carry(yfrac, yfrac, yinc, shape, shape, shape);
+ // Orientation bits select among the 4 sub-tables at each entry.
+ int oct = flips & ((1 << CR_FLIPBIT_FLIP_X) | (1 << CR_FLIPBIT_SWAP_XY));
+ U64 mask = *(U64*)((U8*)lut + oct + (shape << 5));
+
+ // Second half.
+
+ add_add_carry(yfrac, yfrac, yinc, shape, shape, shape);
+ shape = add_clamp_0_x(getHi(yinit) + 4, __popc(shape & 15), 11);
+ add_add_carry(yfrac, yfrac, yinc, shape, shape, shape);
+ add_add_carry(yfrac, yfrac, yinc, shape, shape, shape);
+ add_add_carry(yfrac, yfrac, yinc, shape, shape, shape);
+ mask |= *(U64*)((U8*)lut + oct + (shape << 5) + (12 << 8));
+ return (flips >= (1 << CR_FLIPBIT_COMPL)) ? ~mask : mask;
+}
+
+//------------------------------------------------------------------------
+
+// LUT-free exact 8x8 coverage: evaluates the edge cross product at the
+// origin sample, applies the top-left/exclusive fill rule bias, and
+// delegates bit generation to cover8x8_generateMask_noLUT.
+__device__ __inline__ U64 cover8x8_exact_noLUT(S32 ox, S32 oy, S32 dx, S32 dy)
+{
+ S32 curr = ox * dy - oy * dx;
+ if (dy > 0 || (dy == 0 && dx <= 0)) curr--; // exclusive
+ return cover8x8_generateMask_noLUT(curr, dx, dy);
+}
+
+//------------------------------------------------------------------------
+
+// LUT-free conservative 8x8 coverage: like cover8x8_exact_noLUT, but
+// additionally offsets the edge function by half a pixel's worth of
+// |dx|+|dy| so any pixel the edge touches is included.
+__device__ __inline__ U64 cover8x8_conservative_noLUT(S32 ox, S32 oy, S32 dx, S32 dy)
+{
+ S32 curr = ox * dy - oy * dx;
+ if (dy > 0 || (dy == 0 && dx <= 0)) curr--; // exclusive
+ curr += (::abs(dx) + ::abs(dy)) << (CR_SUBPIXEL_LOG2 - 1);
+ return cover8x8_generateMask_noLUT(curr, dx, dy);
+}
+
+//------------------------------------------------------------------------
+
+// Generates the 64-bit coverage mask from an edge-function value at one
+// sample (curr) and its per-pixel steps. The carry chains accumulate one
+// sign bit per sample into hi/lo (32 samples each); the xor-shift cascade
+// and the dy/stepY sign fixups then convert those per-sample sign
+// transitions into the final fill mask. Statement order is load-bearing
+// throughout — do not reorder.
+__device__ __inline__ U64 cover8x8_generateMask_noLUT(S32 curr, S32 dx, S32 dy)
+{
+ // Move the reference sample and build the per-pixel / per-row steps.
+ curr += (dx - dy) * (7 << CR_SUBPIXEL_LOG2);
+ S32 stepX = dy << (CR_SUBPIXEL_LOG2 + 1);
+ S32 stepYorig = -dx - dy * 7;
+ S32 stepY = stepYorig << (CR_SUBPIXEL_LOG2 + 1);
+
+ // One bit per sample: every 8th step wraps to the next row (stepY).
+ U32 hi = isetge(curr, 0);
+ U32 frac = curr + curr;
+ for (int i = 62; i >= 32; i--)
+ add_add_carry(frac, frac, ((i & 7) == 7) ? stepY : stepX, hi, hi, hi);
+
+ U32 lo = 0;
+ for (int i = 31; i >= 0; i--)
+ add_add_carry(frac, frac, ((i & 7) == 7) ? stepY : stepX, lo, lo, lo);
+
+ // Prefix-xor folds over both words.
+ lo ^= lo >> 1, hi ^= hi >> 1;
+ lo ^= lo >> 2, hi ^= hi >> 2;
+ lo ^= lo >> 4, hi ^= hi >> 4;
+ lo ^= lo >> 8, hi ^= hi >> 8;
+ lo ^= lo >> 16, hi ^= hi >> 16;
+
+ // Sign-dependent pattern fixups.
+ if (dy < 0)
+ {
+ lo ^= 0x55AA55AA;
+ hi ^= 0x55AA55AA;
+ }
+ if (stepYorig < 0)
+ {
+ lo ^= 0xFF00FF00;
+ hi ^= 0x00FF00FF;
+ }
+ if ((hi & 1) != 0)
+ lo = ~lo;
+
+ return combineLoHi(lo, hi);
+}
+
+//------------------------------------------------------------------------
+
+// Block-wide in-place sort of ptr[0..numItems) in shared memory.
+// Assumes numItems <= threads in the block; the caller must sync before
+// and after the call. Stage 1 transposition-sorts each 16-wide subrange;
+// stage 2 merges subranges pairwise via per-thread binary search.
+// Fix: restored the "<class T>" template parameter list, which was lost in
+// this dump (angle-bracket stripping); the body is unchanged.
+template <class T> __device__ __inline__ void sortShared(T* ptr, int numItems)
+{
+ int thrInBlock = threadIdx.x + threadIdx.y * blockDim.x;
+ int range = 16;
+
+ // Use transposition sort within each 16-wide subrange.
+
+ int base = thrInBlock * 2;
+ bool act = (base < numItems - 1);
+ U32 actMask = __ballot_sync(~0u, act);
+ if (act)
+ {
+ // Odd compare-exchange is skipped at subrange boundaries.
+ bool tryOdd = (base < numItems - 2 && (~base & (range - 2)) != 0);
+ T mid = ptr[base + 1];
+
+ for (int iter = 0; iter < range; iter += 2)
+ {
+ // Evens.
+
+ T tmp = ptr[base + 0];
+ if (tmp > mid)
+ {
+ ptr[base + 0] = mid;
+ mid = tmp;
+ }
+ __syncwarp(actMask);
+
+ // Odds.
+
+ if (tryOdd)
+ {
+ tmp = ptr[base + 2];
+ if (mid > tmp)
+ {
+ ptr[base + 2] = mid;
+ mid = tmp;
+ }
+ }
+ __syncwarp(actMask);
+ }
+ ptr[base + 1] = mid;
+ }
+
+ // Multiple subranges => Merge hierarchically.
+
+ for (; range < numItems; range <<= 1)
+ {
+ // Assuming that we would insert the current item into the other
+ // subrange, use binary search to find the appropriate slot.
+
+ __syncthreads();
+
+ T item;
+ int slot;
+ if (thrInBlock < numItems)
+ {
+ item = ptr[thrInBlock];
+ slot = (thrInBlock & -range) ^ range;
+ if (slot < numItems)
+ {
+ T tmp = ptr[slot];
+ // "inclusive" breaks ties so equal items keep a stable order
+ // between the two subranges.
+ bool inclusive = ((thrInBlock & range) != 0);
+ if (tmp < item || (inclusive && tmp == item))
+ {
+ for (int step = (range >> 1); step != 0; step >>= 1)
+ {
+ int probe = slot + step;
+ if (probe < numItems)
+ {
+ tmp = ptr[probe];
+ if (tmp < item || (inclusive && tmp == item))
+ slot = probe;
+ }
+ }
+ slot++;
+ }
+ }
+ }
+
+ // Store the item at an appropriate place.
+
+ __syncthreads();
+
+ if (thrInBlock < numItems)
+ ptr[slot + (thrInBlock & (range * 2 - 1)) - range] = item;
+ }
+}
+
+//------------------------------------------------------------------------
+}
diff --git a/LAM_gpro/external/nvdiffrast/nvdiffrast/common/framework.h b/LAM_gpro/external/nvdiffrast/nvdiffrast/common/framework.h
new file mode 100644
index 0000000..12d803c
--- /dev/null
+++ b/LAM_gpro/external/nvdiffrast/nvdiffrast/common/framework.h
@@ -0,0 +1,49 @@
+// Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
+//
+// NVIDIA CORPORATION and its licensors retain all intellectual property
+// and proprietary rights in and to this software, related documentation
+// and any modifications thereto. Any use, reproduction, disclosure or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA CORPORATION is strictly prohibited.
+
+#pragma once
+
+// Framework-specific macros to enable code sharing.
+
+//------------------------------------------------------------------------
+// Tensorflow.
+
+#ifdef NVDR_TENSORFLOW
+#define EIGEN_USE_GPU
+#include "tensorflow/core/framework/op.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/shape_inference.h"
+#include "tensorflow/core/platform/default/logging.h"
+using namespace tensorflow;
+using namespace tensorflow::shape_inference;
+#define NVDR_CTX_ARGS OpKernelContext* _nvdr_ctx
+#define NVDR_CTX_PARAMS _nvdr_ctx
+#define NVDR_CHECK(COND, ERR) OP_REQUIRES(_nvdr_ctx, COND, errors::Internal(ERR))
+#define NVDR_CHECK_CUDA_ERROR(CUDA_CALL) OP_CHECK_CUDA_ERROR(_nvdr_ctx, CUDA_CALL)
+#define NVDR_CHECK_GL_ERROR(GL_CALL) OP_CHECK_GL_ERROR(_nvdr_ctx, GL_CALL)
+#endif
+
+//------------------------------------------------------------------------
+// PyTorch.
+
+#ifdef NVDR_TORCH
+#ifndef __CUDACC__
+// NOTE(review): the include targets on the following lines were lost in
+// this dump (angle-bracket stripping); restored from upstream nvdiffrast
+// framework.h — confirm against the original file.
+#include <torch/extension.h>
+#include <ATen/cuda/CUDAContext.h>
+#include <ATen/cuda/CUDAUtils.h>
+#include <c10/cuda/CUDAGuard.h>
+#include <pybind11/pybind11.h>
+#endif
+#define NVDR_CTX_ARGS int _nvdr_ctx_dummy
+#define NVDR_CTX_PARAMS 0
+#define NVDR_CHECK(COND, ERR) do { TORCH_CHECK(COND, ERR) } while(0)
+#define NVDR_CHECK_CUDA_ERROR(CUDA_CALL) do { cudaError_t err = CUDA_CALL; TORCH_CHECK(!err, "Cuda error: ", cudaGetLastError(), "[", #CUDA_CALL, ";]"); } while(0)
+#define NVDR_CHECK_GL_ERROR(GL_CALL) do { GL_CALL; GLenum err = glGetError(); TORCH_CHECK(err == GL_NO_ERROR, "OpenGL error: ", getGLErrorString(err), "[", #GL_CALL, ";]"); } while(0)
+#endif
+
+//------------------------------------------------------------------------
diff --git a/LAM_gpro/external/nvdiffrast/nvdiffrast/common/glutil.cpp b/LAM_gpro/external/nvdiffrast/nvdiffrast/common/glutil.cpp
new file mode 100644
index 0000000..2af3e93
--- /dev/null
+++ b/LAM_gpro/external/nvdiffrast/nvdiffrast/common/glutil.cpp
@@ -0,0 +1,403 @@
+// Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
+//
+// NVIDIA CORPORATION and its licensors retain all intellectual property
+// and proprietary rights in and to this software, related documentation
+// and any modifications thereto. Any use, reproduction, disclosure or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA CORPORATION is strictly prohibited.
+
+//------------------------------------------------------------------------
+// Common.
+//------------------------------------------------------------------------
+
+#include "framework.h"
+#include "glutil.h"
+#include <iostream>
+#include <iomanip>
+
+// Create the function pointers.
+#define GLUTIL_EXT(return_type, name, ...) return_type (GLAPIENTRY* name)(__VA_ARGS__) = 0;
+#include "glutil_extlist.h"
+#undef GLUTIL_EXT
+
+// Track initialization status.
+static volatile bool s_glExtInitialized = false;
+
+// Error strings.
+const char* getGLErrorString(GLenum err)
+{
+ switch(err)
+ {
+ case GL_NO_ERROR: return "GL_NO_ERROR";
+ case GL_INVALID_ENUM: return "GL_INVALID_ENUM";
+ case GL_INVALID_VALUE: return "GL_INVALID_VALUE";
+ case GL_INVALID_OPERATION: return "GL_INVALID_OPERATION";
+ case GL_STACK_OVERFLOW: return "GL_STACK_OVERFLOW";
+ case GL_STACK_UNDERFLOW: return "GL_STACK_UNDERFLOW";
+ case GL_OUT_OF_MEMORY: return "GL_OUT_OF_MEMORY";
+ case GL_INVALID_FRAMEBUFFER_OPERATION: return "GL_INVALID_FRAMEBUFFER_OPERATION";
+ case GL_TABLE_TOO_LARGE: return "GL_TABLE_TOO_LARGE";
+ case GL_CONTEXT_LOST: return "GL_CONTEXT_LOST";
+ }
+ return "Unknown error";
+}
+
+//------------------------------------------------------------------------
+// Windows.
+//------------------------------------------------------------------------
+
+#ifdef _WIN32
+
+static CRITICAL_SECTION getInitializedCriticalSection(void)
+{
+ CRITICAL_SECTION cs;
+ InitializeCriticalSection(&cs);
+ return cs;
+}
+
+static CRITICAL_SECTION s_getProcAddressMutex = getInitializedCriticalSection();
+
+static void safeGetProcAddress(const char* name, PROC* pfn)
+{
+ PROC result = wglGetProcAddress(name);
+ if (!result)
+ {
+ LeaveCriticalSection(&s_getProcAddressMutex); // Prepare for thread exit.
+ LOG(FATAL) << "wglGetProcAddress() failed for '" << name << "'";
+ exit(1); // Should never get here but make sure we exit.
+ }
+ *pfn = result;
+}
+
+static void initializeGLExtensions(void)
+{
+ // Use critical section for thread safety.
+ EnterCriticalSection(&s_getProcAddressMutex);
+
+ // Only dig function pointers if not done already.
+ if (!s_glExtInitialized)
+ {
+ // Generate code to populate the function pointers.
+#define GLUTIL_EXT(return_type, name, ...) safeGetProcAddress(#name, (PROC*)&name);
+#include "glutil_extlist.h"
+#undef GLUTIL_EXT
+
+ // Mark as initialized.
+ s_glExtInitialized = true;
+ }
+
+ // Done.
+ LeaveCriticalSection(&s_getProcAddressMutex);
+ return;
+}
+
+void setGLContext(GLContext& glctx)
+{
+ if (!glctx.hglrc)
+ LOG(FATAL) << "setGLContext() called with null gltcx";
+ if (!wglMakeCurrent(glctx.hdc, glctx.hglrc))
+ LOG(FATAL) << "wglMakeCurrent() failed when setting GL context";
+
+ if (glctx.extInitialized)
+ return;
+ initializeGLExtensions();
+ glctx.extInitialized = 1;
+}
+
+void releaseGLContext(void)
+{
+ if (!wglMakeCurrent(NULL, NULL))
+ LOG(FATAL) << "wglMakeCurrent() failed when releasing GL context";
+}
+
+extern "C" int set_gpu(const char*); // In setgpu.lib
+GLContext createGLContext(int cudaDeviceIdx)
+{
+ if (cudaDeviceIdx >= 0)
+ {
+ char pciBusId[256] = "";
+ LOG(INFO) << "Creating GL context for Cuda device " << cudaDeviceIdx;
+ if (cudaDeviceGetPCIBusId(pciBusId, 255, cudaDeviceIdx))
+ {
+ LOG(INFO) << "PCI bus id query failed";
+ }
+ else
+ {
+ int res = set_gpu(pciBusId);
+ LOG(INFO) << "Selecting device with PCI bus id " << pciBusId << " - " << (res ? "failed, expect crash or major slowdown" : "success");
+ }
+ }
+
+ HINSTANCE hInstance = GetModuleHandle(NULL);
+ WNDCLASS wc = {};
+ wc.style = CS_OWNDC;
+ wc.lpfnWndProc = DefWindowProc;
+ wc.hInstance = hInstance;
+ wc.lpszClassName = "__DummyGLClassCPP";
+ int res = RegisterClass(&wc);
+
+ HWND hwnd = CreateWindow(
+ "__DummyGLClassCPP", // lpClassName
+ "__DummyGLWindowCPP", // lpWindowName
+ WS_OVERLAPPEDWINDOW, // dwStyle
+ CW_USEDEFAULT, // x
+ CW_USEDEFAULT, // y
+ 0, 0, // nWidth, nHeight
+ NULL, NULL, // hWndParent, hMenu
+ hInstance, // hInstance
+ NULL // lpParam
+ );
+
+ PIXELFORMATDESCRIPTOR pfd = {};
+ pfd.dwFlags = PFD_SUPPORT_OPENGL;
+ pfd.iPixelType = PFD_TYPE_RGBA;
+ pfd.iLayerType = PFD_MAIN_PLANE;
+ pfd.cColorBits = 32;
+ pfd.cDepthBits = 24;
+ pfd.cStencilBits = 8;
+
+ HDC hdc = GetDC(hwnd);
+ int pixelformat = ChoosePixelFormat(hdc, &pfd);
+ SetPixelFormat(hdc, pixelformat, &pfd);
+
+ HGLRC hglrc = wglCreateContext(hdc);
+ LOG(INFO) << std::hex << std::setfill('0')
+ << "WGL OpenGL context created (hdc: 0x" << std::setw(8) << (uint32_t)(uintptr_t)hdc
+ << ", hglrc: 0x" << std::setw(8) << (uint32_t)(uintptr_t)hglrc << ")";
+
+ GLContext glctx = {hdc, hglrc, 0};
+ return glctx;
+}
+
+void destroyGLContext(GLContext& glctx)
+{
+ if (!glctx.hglrc)
+ LOG(FATAL) << "destroyGLContext() called with null gltcx";
+
+ // If this is the current context, release it.
+ if (wglGetCurrentContext() == glctx.hglrc)
+ releaseGLContext();
+
+ HWND hwnd = WindowFromDC(glctx.hdc);
+ if (!hwnd)
+ LOG(FATAL) << "WindowFromDC() failed";
+ if (!ReleaseDC(hwnd, glctx.hdc))
+ LOG(FATAL) << "ReleaseDC() failed";
+ if (!wglDeleteContext(glctx.hglrc))
+ LOG(FATAL) << "wglDeleteContext() failed";
+ if (!DestroyWindow(hwnd))
+ LOG(FATAL) << "DestroyWindow() failed";
+
+ LOG(INFO) << std::hex << std::setfill('0')
+ << "WGL OpenGL context destroyed (hdc: 0x" << std::setw(8) << (uint32_t)(uintptr_t)glctx.hdc
+ << ", hglrc: 0x" << std::setw(8) << (uint32_t)(uintptr_t)glctx.hglrc << ")";
+
+ memset(&glctx, 0, sizeof(GLContext));
+}
+
+#endif // _WIN32
+
+//------------------------------------------------------------------------
+// Linux.
+//------------------------------------------------------------------------
+
+#ifdef __linux__
+
+static pthread_mutex_t s_getProcAddressMutex;
+
+typedef void (*PROCFN)();
+
+static void safeGetProcAddress(const char* name, PROCFN* pfn)
+{
+ PROCFN result = eglGetProcAddress(name);
+ if (!result)
+ {
+ pthread_mutex_unlock(&s_getProcAddressMutex); // Prepare for thread exit.
+ LOG(FATAL) << "wglGetProcAddress() failed for '" << name << "'";
+ exit(1); // Should never get here but make sure we exit.
+ }
+ *pfn = result;
+}
+
+static void initializeGLExtensions(void)
+{
+ pthread_mutex_lock(&s_getProcAddressMutex);
+
+ // Only dig function pointers if not done already.
+ if (!s_glExtInitialized)
+ {
+ // Generate code to populate the function pointers.
+#define GLUTIL_EXT(return_type, name, ...) safeGetProcAddress(#name, (PROCFN*)&name);
+#include "glutil_extlist.h"
+#undef GLUTIL_EXT
+
+ // Mark as initialized.
+ s_glExtInitialized = true;
+ }
+
+ pthread_mutex_unlock(&s_getProcAddressMutex);
+ return;
+}
+
+void setGLContext(GLContext& glctx)
+{
+ if (!glctx.context)
+ LOG(FATAL) << "setGLContext() called with null gltcx";
+
+ if (!eglMakeCurrent(glctx.display, EGL_NO_SURFACE, EGL_NO_SURFACE, glctx.context))
+ LOG(ERROR) << "eglMakeCurrent() failed when setting GL context";
+
+ if (glctx.extInitialized)
+ return;
+ initializeGLExtensions();
+ glctx.extInitialized = 1;
+}
+
+void releaseGLContext(void)
+{
+ EGLDisplay display = eglGetCurrentDisplay();
+ if (display == EGL_NO_DISPLAY)
+ LOG(WARNING) << "releaseGLContext() called with no active display";
+ if (!eglMakeCurrent(display, EGL_NO_SURFACE, EGL_NO_SURFACE, EGL_NO_CONTEXT))
+ LOG(FATAL) << "eglMakeCurrent() failed when releasing GL context";
+}
+
+static EGLDisplay getCudaDisplay(int cudaDeviceIdx)
+{
+ typedef EGLBoolean (*eglQueryDevicesEXT_t)(EGLint, EGLDeviceEXT, EGLint*);
+ typedef EGLBoolean (*eglQueryDeviceAttribEXT_t)(EGLDeviceEXT, EGLint, EGLAttrib*);
+ typedef EGLDisplay (*eglGetPlatformDisplayEXT_t)(EGLenum, void*, const EGLint*);
+
+ eglQueryDevicesEXT_t eglQueryDevicesEXT = (eglQueryDevicesEXT_t)eglGetProcAddress("eglQueryDevicesEXT");
+ if (!eglQueryDevicesEXT)
+ {
+ LOG(INFO) << "eglGetProcAddress(\"eglQueryDevicesEXT\") failed";
+ return 0;
+ }
+
+ eglQueryDeviceAttribEXT_t eglQueryDeviceAttribEXT = (eglQueryDeviceAttribEXT_t)eglGetProcAddress("eglQueryDeviceAttribEXT");
+ if (!eglQueryDeviceAttribEXT)
+ {
+ LOG(INFO) << "eglGetProcAddress(\"eglQueryDeviceAttribEXT\") failed";
+ return 0;
+ }
+
+ eglGetPlatformDisplayEXT_t eglGetPlatformDisplayEXT = (eglGetPlatformDisplayEXT_t)eglGetProcAddress("eglGetPlatformDisplayEXT");
+ if (!eglGetPlatformDisplayEXT)
+ {
+ LOG(INFO) << "eglGetProcAddress(\"eglGetPlatformDisplayEXT\") failed";
+ return 0;
+ }
+
+ int num_devices = 0;
+ eglQueryDevicesEXT(0, 0, &num_devices);
+ if (!num_devices)
+ return 0;
+
+ EGLDisplay display = 0;
+ EGLDeviceEXT* devices = (EGLDeviceEXT*)malloc(num_devices * sizeof(void*));
+ eglQueryDevicesEXT(num_devices, devices, &num_devices);
+ for (int i=0; i < num_devices; i++)
+ {
+ EGLDeviceEXT device = devices[i];
+ intptr_t value = -1;
+ if (eglQueryDeviceAttribEXT(device, EGL_CUDA_DEVICE_NV, &value) && value == cudaDeviceIdx)
+ {
+ display = eglGetPlatformDisplayEXT(EGL_PLATFORM_DEVICE_EXT, device, 0);
+ break;
+ }
+ }
+
+ free(devices);
+ return display;
+}
+
+GLContext createGLContext(int cudaDeviceIdx)
+{
+ EGLDisplay display = 0;
+
+ if (cudaDeviceIdx >= 0)
+ {
+ char pciBusId[256] = "";
+ LOG(INFO) << "Creating GL context for Cuda device " << cudaDeviceIdx;
+ display = getCudaDisplay(cudaDeviceIdx);
+ if (!display)
+ LOG(INFO) << "Failed, falling back to default display";
+ }
+
+ if (!display)
+ {
+ display = eglGetDisplay(EGL_DEFAULT_DISPLAY);
+ if (display == EGL_NO_DISPLAY)
+ LOG(FATAL) << "eglGetDisplay() failed";
+ }
+
+ EGLint major;
+ EGLint minor;
+ if (!eglInitialize(display, &major, &minor))
+ LOG(FATAL) << "eglInitialize() failed";
+
+ // Choose configuration.
+
+ const EGLint context_attribs[] = {
+ EGL_RED_SIZE, 8,
+ EGL_GREEN_SIZE, 8,
+ EGL_BLUE_SIZE, 8,
+ EGL_ALPHA_SIZE, 8,
+ EGL_DEPTH_SIZE, 24,
+ EGL_STENCIL_SIZE, 8,
+ EGL_RENDERABLE_TYPE, EGL_OPENGL_BIT,
+ EGL_SURFACE_TYPE, EGL_PBUFFER_BIT,
+ EGL_NONE
+ };
+
+ EGLConfig config;
+ EGLint num_config;
+ if (!eglChooseConfig(display, context_attribs, &config, 1, &num_config))
+ LOG(FATAL) << "eglChooseConfig() failed";
+
+ // Create GL context.
+
+ if (!eglBindAPI(EGL_OPENGL_API))
+ LOG(FATAL) << "eglBindAPI() failed";
+
+ EGLContext context = eglCreateContext(display, config, EGL_NO_CONTEXT, NULL);
+ if (context == EGL_NO_CONTEXT)
+ LOG(FATAL) << "eglCreateContext() failed";
+
+ // Done.
+
+ LOG(INFO) << "EGL " << (int)minor << "." << (int)major << " OpenGL context created (disp: 0x"
+ << std::hex << std::setfill('0')
+ << std::setw(16) << (uintptr_t)display
+ << ", ctx: 0x" << std::setw(16) << (uintptr_t)context << ")";
+
+ GLContext glctx = {display, context, 0};
+ return glctx;
+}
+
+void destroyGLContext(GLContext& glctx)
+{
+ if (!glctx.context)
+ LOG(FATAL) << "destroyGLContext() called with null gltcx";
+
+ // If this is the current context, release it.
+ if (eglGetCurrentContext() == glctx.context)
+ releaseGLContext();
+
+ if (!eglDestroyContext(glctx.display, glctx.context))
+ LOG(ERROR) << "eglDestroyContext() failed";
+
+ LOG(INFO) << "EGL OpenGL context destroyed (disp: 0x"
+ << std::hex << std::setfill('0')
+ << std::setw(16) << (uintptr_t)glctx.display
+ << ", ctx: 0x" << std::setw(16) << (uintptr_t)glctx.context << ")";
+
+ memset(&glctx, 0, sizeof(GLContext));
+}
+
+//------------------------------------------------------------------------
+
+#endif // __linux__
+
+//------------------------------------------------------------------------
diff --git a/LAM_gpro/external/nvdiffrast/nvdiffrast/common/glutil.h b/LAM_gpro/external/nvdiffrast/nvdiffrast/common/glutil.h
new file mode 100644
index 0000000..e9a3a7d
--- /dev/null
+++ b/LAM_gpro/external/nvdiffrast/nvdiffrast/common/glutil.h
@@ -0,0 +1,113 @@
+// Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
+//
+// NVIDIA CORPORATION and its licensors retain all intellectual property
+// and proprietary rights in and to this software, related documentation
+// and any modifications thereto. Any use, reproduction, disclosure or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA CORPORATION is strictly prohibited.
+
+#pragma once
+
+//------------------------------------------------------------------------
+// Windows-specific headers and types.
+//------------------------------------------------------------------------
+
+#ifdef _WIN32
+#define NOMINMAX
+#include <windows.h> // Required by gl.h in Windows.
+#define GLAPIENTRY APIENTRY
+
+struct GLContext
+{
+ HDC hdc;
+ HGLRC hglrc;
+ int extInitialized;
+};
+
+#endif // _WIN32
+
+//------------------------------------------------------------------------
+// Linux-specific headers and types.
+//------------------------------------------------------------------------
+
+#ifdef __linux__
+#define EGL_NO_X11 // X11/Xlib.h has "#define Status int" which breaks Tensorflow. Avoid it.
+#define MESA_EGL_NO_X11_HEADERS
+#include <EGL/egl.h>
+#include <EGL/eglext.h>
+#define GLAPIENTRY
+
+struct GLContext
+{
+ EGLDisplay display;
+ EGLContext context;
+ int extInitialized;
+};
+
+#endif // __linux__
+
+//------------------------------------------------------------------------
+// OpenGL, CUDA interop, GL extensions.
+//------------------------------------------------------------------------
+#define GL_GLEXT_LEGACY
+#include <GL/gl.h>
+#include <GL/glext.h>
+
+// Constants.
+#ifndef GL_VERSION_1_2
+#define GL_CLAMP_TO_EDGE 0x812F
+#define GL_TEXTURE_3D 0x806F
+#endif
+#ifndef GL_VERSION_1_5
+#define GL_ARRAY_BUFFER 0x8892
+#define GL_DYNAMIC_DRAW 0x88E8
+#define GL_ELEMENT_ARRAY_BUFFER 0x8893
+#endif
+#ifndef GL_VERSION_2_0
+#define GL_FRAGMENT_SHADER 0x8B30
+#define GL_INFO_LOG_LENGTH 0x8B84
+#define GL_LINK_STATUS 0x8B82
+#define GL_VERTEX_SHADER 0x8B31
+#endif
+#ifndef GL_VERSION_3_0
+#define GL_MAJOR_VERSION 0x821B
+#define GL_MINOR_VERSION 0x821C
+#define GL_RGBA32F 0x8814
+#define GL_TEXTURE_2D_ARRAY 0x8C1A
+#endif
+#ifndef GL_VERSION_3_2
+#define GL_GEOMETRY_SHADER 0x8DD9
+#endif
+#ifndef GL_ARB_framebuffer_object
+#define GL_COLOR_ATTACHMENT0 0x8CE0
+#define GL_COLOR_ATTACHMENT1 0x8CE1
+#define GL_DEPTH_STENCIL 0x84F9
+#define GL_DEPTH_STENCIL_ATTACHMENT 0x821A
+#define GL_DEPTH24_STENCIL8 0x88F0
+#define GL_FRAMEBUFFER 0x8D40
+#define GL_INVALID_FRAMEBUFFER_OPERATION 0x0506
+#define GL_UNSIGNED_INT_24_8 0x84FA
+#endif
+#ifndef GL_ARB_imaging
+#define GL_TABLE_TOO_LARGE 0x8031
+#endif
+#ifndef GL_KHR_robustness
+#define GL_CONTEXT_LOST 0x0507
+#endif
+
+// Declare function pointers to OpenGL extension functions.
+#define GLUTIL_EXT(return_type, name, ...) extern return_type (GLAPIENTRY* name)(__VA_ARGS__);
+#include "glutil_extlist.h"
+#undef GLUTIL_EXT
+
+//------------------------------------------------------------------------
+// Common functions.
+//------------------------------------------------------------------------
+
+void setGLContext (GLContext& glctx);
+void releaseGLContext (void);
+GLContext createGLContext (int cudaDeviceIdx);
+void destroyGLContext (GLContext& glctx);
+const char* getGLErrorString (GLenum err);
+
+//------------------------------------------------------------------------
diff --git a/LAM_gpro/external/nvdiffrast/nvdiffrast/common/glutil_extlist.h b/LAM_gpro/external/nvdiffrast/nvdiffrast/common/glutil_extlist.h
new file mode 100644
index 0000000..afa08f3
--- /dev/null
+++ b/LAM_gpro/external/nvdiffrast/nvdiffrast/common/glutil_extlist.h
@@ -0,0 +1,48 @@
+// Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
+//
+// NVIDIA CORPORATION and its licensors retain all intellectual property
+// and proprietary rights in and to this software, related documentation
+// and any modifications thereto. Any use, reproduction, disclosure or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA CORPORATION is strictly prohibited.
+
+#ifndef GL_VERSION_1_2
+GLUTIL_EXT(void, glTexImage3D, GLenum target, GLint level, GLint internalFormat, GLsizei width, GLsizei height, GLsizei depth, GLint border, GLenum format, GLenum type, const void *pixels);
+#endif
+#ifndef GL_VERSION_1_5
+GLUTIL_EXT(void, glBindBuffer, GLenum target, GLuint buffer);
+GLUTIL_EXT(void, glBufferData, GLenum target, ptrdiff_t size, const void* data, GLenum usage);
+GLUTIL_EXT(void, glGenBuffers, GLsizei n, GLuint* buffers);
+#endif
+#ifndef GL_VERSION_2_0
+GLUTIL_EXT(void, glAttachShader, GLuint program, GLuint shader);
+GLUTIL_EXT(void, glCompileShader, GLuint shader);
+GLUTIL_EXT(GLuint, glCreateProgram, void);
+GLUTIL_EXT(GLuint, glCreateShader, GLenum type);
+GLUTIL_EXT(void, glDrawBuffers, GLsizei n, const GLenum* bufs);
+GLUTIL_EXT(void, glEnableVertexAttribArray, GLuint index);
+GLUTIL_EXT(void, glGetProgramInfoLog, GLuint program, GLsizei bufSize, GLsizei* length, char* infoLog);
+GLUTIL_EXT(void, glGetProgramiv, GLuint program, GLenum pname, GLint* param);
+GLUTIL_EXT(void, glLinkProgram, GLuint program);
+GLUTIL_EXT(void, glShaderSource, GLuint shader, GLsizei count, const char *const* string, const GLint* length);
+GLUTIL_EXT(void, glUniform1f, GLint location, GLfloat v0);
+GLUTIL_EXT(void, glUniform2f, GLint location, GLfloat v0, GLfloat v1);
+GLUTIL_EXT(void, glUseProgram, GLuint program);
+GLUTIL_EXT(void, glVertexAttribPointer, GLuint index, GLint size, GLenum type, GLboolean normalized, GLsizei stride, const void* pointer);
+#endif
+#ifndef GL_VERSION_3_2
+GLUTIL_EXT(void, glFramebufferTexture, GLenum target, GLenum attachment, GLuint texture, GLint level);
+#endif
+#ifndef GL_ARB_framebuffer_object
+GLUTIL_EXT(void, glBindFramebuffer, GLenum target, GLuint framebuffer);
+GLUTIL_EXT(void, glGenFramebuffers, GLsizei n, GLuint* framebuffers);
+#endif
+#ifndef GL_ARB_vertex_array_object
+GLUTIL_EXT(void, glBindVertexArray, GLuint array);
+GLUTIL_EXT(void, glGenVertexArrays, GLsizei n, GLuint* arrays);
+#endif
+#ifndef GL_ARB_multi_draw_indirect
+GLUTIL_EXT(void, glMultiDrawElementsIndirect, GLenum mode, GLenum type, const void *indirect, GLsizei primcount, GLsizei stride);
+#endif
+
+//------------------------------------------------------------------------
diff --git a/LAM_gpro/external/nvdiffrast/nvdiffrast/common/interpolate.cu b/LAM_gpro/external/nvdiffrast/nvdiffrast/common/interpolate.cu
new file mode 100644
index 0000000..3bd2a7a
--- /dev/null
+++ b/LAM_gpro/external/nvdiffrast/nvdiffrast/common/interpolate.cu
@@ -0,0 +1,276 @@
+// Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
+//
+// NVIDIA CORPORATION and its licensors retain all intellectual property
+// and proprietary rights in and to this software, related documentation
+// and any modifications thereto. Any use, reproduction, disclosure or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA CORPORATION is strictly prohibited.
+
+#include "common.h"
+#include "interpolate.h"
+
+//------------------------------------------------------------------------
+// Forward kernel.
+
+template <bool ENABLE_DA>
+static __forceinline__ __device__ void InterpolateFwdKernelTemplate(const InterpolateKernelParams p)
+{
+ // Calculate pixel position.
+ int px = blockIdx.x * blockDim.x + threadIdx.x;
+ int py = blockIdx.y * blockDim.y + threadIdx.y;
+ int pz = blockIdx.z;
+ if (px >= p.width || py >= p.height || pz >= p.depth)
+ return;
+
+ // Pixel index.
+ int pidx = px + p.width * (py + p.height * pz);
+
+ // Output ptrs.
+ float* out = p.out + pidx * p.numAttr;
+ float2* outDA = ENABLE_DA ? (((float2*)p.outDA) + pidx * p.numDiffAttr) : 0;
+
+ // Fetch rasterizer output.
+ float4 r = ((float4*)p.rast)[pidx];
+ int triIdx = float_to_triidx(r.w) - 1;
+ bool triValid = (triIdx >= 0 && triIdx < p.numTriangles);
+
+ // If no geometry in entire warp, zero the output and exit.
+ // Otherwise force barys to zero and output with live threads.
+ if (__all_sync(0xffffffffu, !triValid))
+ {
+ for (int i=0; i < p.numAttr; i++)
+ out[i] = 0.f;
+ if (ENABLE_DA)
+ for (int i=0; i < p.numDiffAttr; i++)
+ outDA[i] = make_float2(0.f, 0.f);
+ return;
+ }
+
+ // Fetch vertex indices.
+ int vi0 = triValid ? p.tri[triIdx * 3 + 0] : 0;
+ int vi1 = triValid ? p.tri[triIdx * 3 + 1] : 0;
+ int vi2 = triValid ? p.tri[triIdx * 3 + 2] : 0;
+
+ // Bail out if corrupt indices.
+ if (vi0 < 0 || vi0 >= p.numVertices ||
+ vi1 < 0 || vi1 >= p.numVertices ||
+ vi2 < 0 || vi2 >= p.numVertices)
+ return;
+
+ // In instance mode, adjust vertex indices by minibatch index unless broadcasting.
+ if (p.instance_mode && !p.attrBC)
+ {
+ vi0 += pz * p.numVertices;
+ vi1 += pz * p.numVertices;
+ vi2 += pz * p.numVertices;
+ }
+
+ // Pointers to attributes.
+ const float* a0 = p.attr + vi0 * p.numAttr;
+ const float* a1 = p.attr + vi1 * p.numAttr;
+ const float* a2 = p.attr + vi2 * p.numAttr;
+
+ // Barys. If no triangle, force all to zero -> output is zero.
+ float b0 = triValid ? r.x : 0.f;
+ float b1 = triValid ? r.y : 0.f;
+ float b2 = triValid ? (1.f - r.x - r.y) : 0.f;
+
+ // Interpolate and write attributes.
+ for (int i=0; i < p.numAttr; i++)
+ out[i] = b0*a0[i] + b1*a1[i] + b2*a2[i];
+
+ // No diff attrs? Exit.
+ if (!ENABLE_DA)
+ return;
+
+ // Read bary pixel differentials if we have a triangle.
+ float4 db = make_float4(0.f, 0.f, 0.f, 0.f);
+ if (triValid)
+ db = ((float4*)p.rastDB)[pidx];
+
+ // Unpack a bit.
+ float dudx = db.x;
+ float dudy = db.y;
+ float dvdx = db.z;
+ float dvdy = db.w;
+
+ // Calculate the pixel differentials of chosen attributes.
+ for (int i=0; i < p.numDiffAttr; i++)
+ {
+ // Input attribute index.
+ int j = p.diff_attrs_all ? i : p.diffAttrs[i];
+ if (j < 0)
+ j += p.numAttr; // Python-style negative indices.
+
+ // Zero output if invalid index.
+ float dsdx = 0.f;
+ float dsdy = 0.f;
+ if (j >= 0 && j < p.numAttr)
+ {
+ float s0 = a0[j];
+ float s1 = a1[j];
+ float s2 = a2[j];
+ float dsdu = s0 - s2;
+ float dsdv = s1 - s2;
+ dsdx = dudx*dsdu + dvdx*dsdv;
+ dsdy = dudy*dsdu + dvdy*dsdv;
+ }
+
+ // Write.
+ outDA[i] = make_float2(dsdx, dsdy);
+ }
+}
+
+// Template specializations.
+__global__ void InterpolateFwdKernel (const InterpolateKernelParams p) { InterpolateFwdKernelTemplate<false>(p); }
+__global__ void InterpolateFwdKernelDa(const InterpolateKernelParams p) { InterpolateFwdKernelTemplate<true>(p); }
+
+//------------------------------------------------------------------------
+// Gradient kernel.
+
+template <bool ENABLE_DA>
+static __forceinline__ __device__ void InterpolateGradKernelTemplate(const InterpolateKernelParams p)
+{
+ // Temporary space for coalesced atomics.
+ CA_DECLARE_TEMP(IP_GRAD_MAX_KERNEL_BLOCK_WIDTH * IP_GRAD_MAX_KERNEL_BLOCK_HEIGHT);
+
+ // Calculate pixel position.
+ int px = blockIdx.x * blockDim.x + threadIdx.x;
+ int py = blockIdx.y * blockDim.y + threadIdx.y;
+ int pz = blockIdx.z;
+ if (px >= p.width || py >= p.height || pz >= p.depth)
+ return;
+
+ // Pixel index.
+ int pidx = px + p.width * (py + p.height * pz);
+
+ // Fetch triangle ID. If none, output zero bary/db gradients and exit.
+ float4 r = ((float4*)p.rast)[pidx];
+ int triIdx = float_to_triidx(r.w) - 1;
+ if (triIdx < 0 || triIdx >= p.numTriangles)
+ {
+ ((float4*)p.gradRaster)[pidx] = make_float4(0.f, 0.f, 0.f, 0.f);
+ if (ENABLE_DA)
+ ((float4*)p.gradRasterDB)[pidx] = make_float4(0.f, 0.f, 0.f, 0.f);
+ return;
+ }
+
+ // Fetch vertex indices.
+ int vi0 = p.tri[triIdx * 3 + 0];
+ int vi1 = p.tri[triIdx * 3 + 1];
+ int vi2 = p.tri[triIdx * 3 + 2];
+
+ // Bail out if corrupt indices.
+ if (vi0 < 0 || vi0 >= p.numVertices ||
+ vi1 < 0 || vi1 >= p.numVertices ||
+ vi2 < 0 || vi2 >= p.numVertices)
+ return;
+
+ // In instance mode, adjust vertex indices by minibatch index unless broadcasting.
+ if (p.instance_mode && !p.attrBC)
+ {
+ vi0 += pz * p.numVertices;
+ vi1 += pz * p.numVertices;
+ vi2 += pz * p.numVertices;
+ }
+
+ // Initialize coalesced atomics.
+ CA_SET_GROUP(triIdx);
+
+ // Pointers to inputs.
+ const float* a0 = p.attr + vi0 * p.numAttr;
+ const float* a1 = p.attr + vi1 * p.numAttr;
+ const float* a2 = p.attr + vi2 * p.numAttr;
+ const float* pdy = p.dy + pidx * p.numAttr;
+
+ // Pointers to outputs.
+ float* ga0 = p.gradAttr + vi0 * p.numAttr;
+ float* ga1 = p.gradAttr + vi1 * p.numAttr;
+ float* ga2 = p.gradAttr + vi2 * p.numAttr;
+
+ // Barys and bary gradient accumulators.
+ float b0 = r.x;
+ float b1 = r.y;
+ float b2 = 1.f - r.x - r.y;
+ float gb0 = 0.f;
+ float gb1 = 0.f;
+
+ // Loop over attributes and accumulate attribute gradients.
+ for (int i=0; i < p.numAttr; i++)
+ {
+ float y = pdy[i];
+ float s0 = a0[i];
+ float s1 = a1[i];
+ float s2 = a2[i];
+ gb0 += y * (s0 - s2);
+ gb1 += y * (s1 - s2);
+ caAtomicAdd(ga0 + i, b0 * y);
+ caAtomicAdd(ga1 + i, b1 * y);
+ caAtomicAdd(ga2 + i, b2 * y);
+ }
+
+ // Write the bary gradients.
+ ((float4*)p.gradRaster)[pidx] = make_float4(gb0, gb1, 0.f, 0.f);
+
+ // If pixel differentials disabled, we're done.
+ if (!ENABLE_DA)
+ return;
+
+ // Calculate gradients based on attribute pixel differentials.
+ const float2* dda = ((float2*)p.dda) + pidx * p.numDiffAttr;
+ float gdudx = 0.f;
+ float gdudy = 0.f;
+ float gdvdx = 0.f;
+ float gdvdy = 0.f;
+
+ // Read bary pixel differentials.
+ float4 db = ((float4*)p.rastDB)[pidx];
+ float dudx = db.x;
+ float dudy = db.y;
+ float dvdx = db.z;
+ float dvdy = db.w;
+
+ for (int i=0; i < p.numDiffAttr; i++)
+ {
+ // Input attribute index.
+ int j = p.diff_attrs_all ? i : p.diffAttrs[i];
+ if (j < 0)
+ j += p.numAttr; // Python-style negative indices.
+
+ // Check that index is valid.
+ if (j >= 0 && j < p.numAttr)
+ {
+ float2 dsdxy = dda[i];
+ float dsdx = dsdxy.x;
+ float dsdy = dsdxy.y;
+
+ float s0 = a0[j];
+ float s1 = a1[j];
+ float s2 = a2[j];
+
+ // Gradients of db.
+ float dsdu = s0 - s2;
+ float dsdv = s1 - s2;
+ gdudx += dsdu * dsdx;
+ gdudy += dsdu * dsdy;
+ gdvdx += dsdv * dsdx;
+ gdvdy += dsdv * dsdy;
+
+ // Gradients of attributes.
+ float du = dsdx*dudx + dsdy*dudy;
+ float dv = dsdx*dvdx + dsdy*dvdy;
+ caAtomicAdd(ga0 + j, du);
+ caAtomicAdd(ga1 + j, dv);
+ caAtomicAdd(ga2 + j, -du - dv);
+ }
+ }
+
+ // Write.
+ ((float4*)p.gradRasterDB)[pidx] = make_float4(gdudx, gdudy, gdvdx, gdvdy);
+}
+
+// Template specializations.
+__global__ void InterpolateGradKernel (const InterpolateKernelParams p) { InterpolateGradKernelTemplate<false>(p); }
+__global__ void InterpolateGradKernelDa(const InterpolateKernelParams p) { InterpolateGradKernelTemplate<true>(p); }
+
+//------------------------------------------------------------------------
diff --git a/LAM_gpro/external/nvdiffrast/nvdiffrast/common/interpolate.h b/LAM_gpro/external/nvdiffrast/nvdiffrast/common/interpolate.h
new file mode 100644
index 0000000..d35d838
--- /dev/null
+++ b/LAM_gpro/external/nvdiffrast/nvdiffrast/common/interpolate.h
@@ -0,0 +1,49 @@
+// Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
+//
+// NVIDIA CORPORATION and its licensors retain all intellectual property
+// and proprietary rights in and to this software, related documentation
+// and any modifications thereto. Any use, reproduction, disclosure or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA CORPORATION is strictly prohibited.
+
+#pragma once
+
+//------------------------------------------------------------------------
+// Constants and helpers.
+
+#define IP_FWD_MAX_KERNEL_BLOCK_WIDTH 8
+#define IP_FWD_MAX_KERNEL_BLOCK_HEIGHT 8
+#define IP_GRAD_MAX_KERNEL_BLOCK_WIDTH 8
+#define IP_GRAD_MAX_KERNEL_BLOCK_HEIGHT 8
+#define IP_MAX_DIFF_ATTRS 32
+
+//------------------------------------------------------------------------
+// CUDA kernel params.
+
+struct InterpolateKernelParams
+{
+ const int* tri; // Incoming triangle buffer.
+ const float* attr; // Incoming attribute buffer.
+ const float* rast; // Incoming rasterizer output buffer.
+ const float* rastDB; // Incoming rasterizer output buffer for bary derivatives.
+ const float* dy; // Incoming attribute gradients.
+ const float* dda; // Incoming attr diff gradients.
+ float* out; // Outgoing interpolated attributes.
+ float* outDA; // Outgoing texcoord major axis lengths.
+ float* gradAttr; // Outgoing attribute gradients.
+ float* gradRaster; // Outgoing rasterizer gradients.
+ float* gradRasterDB; // Outgoing rasterizer bary diff gradients.
+ int numTriangles; // Number of triangles.
+ int numVertices; // Number of vertices.
+ int numAttr; // Number of total vertex attributes.
+ int numDiffAttr; // Number of attributes to differentiate.
+ int width; // Image width.
+ int height; // Image height.
+ int depth; // Minibatch size.
+ int attrBC; // 0=normal, 1=attr is broadcast.
+ int instance_mode; // 0=normal, 1=instance mode.
+ int diff_attrs_all; // 0=normal, 1=produce pixel differentials for all attributes.
+ int diffAttrs[IP_MAX_DIFF_ATTRS]; // List of attributes to differentiate.
+};
+
+//------------------------------------------------------------------------
diff --git a/LAM_gpro/external/nvdiffrast/nvdiffrast/common/rasterize.cu b/LAM_gpro/external/nvdiffrast/nvdiffrast/common/rasterize.cu
new file mode 100644
index 0000000..455aca3
--- /dev/null
+++ b/LAM_gpro/external/nvdiffrast/nvdiffrast/common/rasterize.cu
@@ -0,0 +1,276 @@
+// Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
+//
+// NVIDIA CORPORATION and its licensors retain all intellectual property
+// and proprietary rights in and to this software, related documentation
+// and any modifications thereto. Any use, reproduction, disclosure or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA CORPORATION is strictly prohibited.
+
+#include "common.h"
+#include "rasterize.h"
+
+//------------------------------------------------------------------------
+// Cuda forward rasterizer pixel shader kernel.
+
+__global__ void RasterizeCudaFwdShaderKernel(const RasterizeCudaFwdShaderParams p)
+{
+ // Calculate pixel position.
+ int px = blockIdx.x * blockDim.x + threadIdx.x;
+ int py = blockIdx.y * blockDim.y + threadIdx.y;
+ int pz = blockIdx.z;
+ if (px >= p.width_out || py >= p.height_out || pz >= p.depth)
+ return;
+
+ // Pixel indices.
+ int pidx_in = px + p.width_in * (py + p.height_in * pz);
+ int pidx_out = px + p.width_out * (py + p.height_out * pz);
+
+ // Fetch triangle idx.
+ int triIdx = p.in_idx[pidx_in] - 1;
+ if (triIdx < 0 || triIdx >= p.numTriangles)
+ {
+ // No or corrupt triangle.
+ ((float4*)p.out)[pidx_out] = make_float4(0.0, 0.0, 0.0, 0.0); // Clear out.
+ ((float4*)p.out_db)[pidx_out] = make_float4(0.0, 0.0, 0.0, 0.0); // Clear out_db.
+ return;
+ }
+
+ // Fetch vertex indices.
+ int vi0 = p.tri[triIdx * 3 + 0];
+ int vi1 = p.tri[triIdx * 3 + 1];
+ int vi2 = p.tri[triIdx * 3 + 2];
+
+ // Bail out if vertex indices are corrupt.
+ if (vi0 < 0 || vi0 >= p.numVertices ||
+ vi1 < 0 || vi1 >= p.numVertices ||
+ vi2 < 0 || vi2 >= p.numVertices)
+ return;
+
+ // In instance mode, adjust vertex indices by minibatch index.
+ if (p.instance_mode)
+ {
+ vi0 += pz * p.numVertices;
+ vi1 += pz * p.numVertices;
+ vi2 += pz * p.numVertices;
+ }
+
+ // Fetch vertex positions.
+ float4 p0 = ((float4*)p.pos)[vi0];
+ float4 p1 = ((float4*)p.pos)[vi1];
+ float4 p2 = ((float4*)p.pos)[vi2];
+
+ // Evaluate edge functions.
+ float fx = p.xs * (float)px + p.xo;
+ float fy = p.ys * (float)py + p.yo;
+ float p0x = p0.x - fx * p0.w;
+ float p0y = p0.y - fy * p0.w;
+ float p1x = p1.x - fx * p1.w;
+ float p1y = p1.y - fy * p1.w;
+ float p2x = p2.x - fx * p2.w;
+ float p2y = p2.y - fy * p2.w;
+ float a0 = p1x*p2y - p1y*p2x;
+ float a1 = p2x*p0y - p2y*p0x;
+ float a2 = p0x*p1y - p0y*p1x;
+
+ // Perspective correct, normalized barycentrics.
+ float iw = 1.f / (a0 + a1 + a2);
+ float b0 = a0 * iw;
+ float b1 = a1 * iw;
+
+ // Compute z/w for depth buffer.
+ float z = p0.z * a0 + p1.z * a1 + p2.z * a2;
+ float w = p0.w * a0 + p1.w * a1 + p2.w * a2;
+ float zw = z / w;
+
+ // Clamps to avoid NaNs.
+ b0 = __saturatef(b0); // Clamp to [+0.0, 1.0].
+ b1 = __saturatef(b1); // Clamp to [+0.0, 1.0].
+ zw = fmaxf(fminf(zw, 1.f), -1.f);
+
+ // Emit output.
+ ((float4*)p.out)[pidx_out] = make_float4(b0, b1, zw, triidx_to_float(triIdx + 1));
+
+ // Calculate bary pixel differentials.
+ float dfxdx = p.xs * iw;
+ float dfydy = p.ys * iw;
+ float da0dx = p2.y*p1.w - p1.y*p2.w;
+ float da0dy = p1.x*p2.w - p2.x*p1.w;
+ float da1dx = p0.y*p2.w - p2.y*p0.w;
+ float da1dy = p2.x*p0.w - p0.x*p2.w;
+ float da2dx = p1.y*p0.w - p0.y*p1.w;
+ float da2dy = p0.x*p1.w - p1.x*p0.w;
+ float datdx = da0dx + da1dx + da2dx;
+ float datdy = da0dy + da1dy + da2dy;
+ float dudx = dfxdx * (b0 * datdx - da0dx);
+ float dudy = dfydy * (b0 * datdy - da0dy);
+ float dvdx = dfxdx * (b1 * datdx - da1dx);
+ float dvdy = dfydy * (b1 * datdy - da1dy);
+
+ // Emit bary pixel differentials.
+ ((float4*)p.out_db)[pidx_out] = make_float4(dudx, dudy, dvdx, dvdy);
+}
+
+//------------------------------------------------------------------------
+// Gradient Cuda kernel.
+
+template <bool ENABLE_DB>
+static __forceinline__ __device__ void RasterizeGradKernelTemplate(const RasterizeGradParams p)
+{
+ // Temporary space for coalesced atomics.
+ CA_DECLARE_TEMP(RAST_GRAD_MAX_KERNEL_BLOCK_WIDTH * RAST_GRAD_MAX_KERNEL_BLOCK_HEIGHT);
+
+ // Calculate pixel position.
+ int px = blockIdx.x * blockDim.x + threadIdx.x;
+ int py = blockIdx.y * blockDim.y + threadIdx.y;
+ int pz = blockIdx.z;
+ if (px >= p.width || py >= p.height || pz >= p.depth)
+ return;
+
+ // Pixel index.
+ int pidx = px + p.width * (py + p.height * pz);
+
+ // Read triangle idx and dy.
+ float2 dy = ((float2*)p.dy)[pidx * 2];
+ float4 ddb = ENABLE_DB ? ((float4*)p.ddb)[pidx] : make_float4(0.f, 0.f, 0.f, 0.f);
+ int triIdx = float_to_triidx(((float*)p.out)[pidx * 4 + 3]) - 1;
+
+ // Exit if nothing to do.
+ if (triIdx < 0 || triIdx >= p.numTriangles)
+ return; // No or corrupt triangle.
+ int grad_all_dy = __float_as_int(dy.x) | __float_as_int(dy.y); // Bitwise OR of all incoming gradients.
+ int grad_all_ddb = 0;
+ if (ENABLE_DB)
+ grad_all_ddb = __float_as_int(ddb.x) | __float_as_int(ddb.y) | __float_as_int(ddb.z) | __float_as_int(ddb.w);
+ if (((grad_all_dy | grad_all_ddb) << 1) == 0)
+ return; // All incoming gradients are +0/-0.
+
+ // Fetch vertex indices.
+ int vi0 = p.tri[triIdx * 3 + 0];
+ int vi1 = p.tri[triIdx * 3 + 1];
+ int vi2 = p.tri[triIdx * 3 + 2];
+
+ // Bail out if vertex indices are corrupt.
+ if (vi0 < 0 || vi0 >= p.numVertices ||
+ vi1 < 0 || vi1 >= p.numVertices ||
+ vi2 < 0 || vi2 >= p.numVertices)
+ return;
+
+ // In instance mode, adjust vertex indices by minibatch index.
+ if (p.instance_mode)
+ {
+ vi0 += pz * p.numVertices;
+ vi1 += pz * p.numVertices;
+ vi2 += pz * p.numVertices;
+ }
+
+ // Initialize coalesced atomics.
+ CA_SET_GROUP(triIdx);
+
+ // Fetch vertex positions.
+ float4 p0 = ((float4*)p.pos)[vi0];
+ float4 p1 = ((float4*)p.pos)[vi1];
+ float4 p2 = ((float4*)p.pos)[vi2];
+
+ // Evaluate edge functions.
+ float fx = p.xs * (float)px + p.xo;
+ float fy = p.ys * (float)py + p.yo;
+ float p0x = p0.x - fx * p0.w;
+ float p0y = p0.y - fy * p0.w;
+ float p1x = p1.x - fx * p1.w;
+ float p1y = p1.y - fy * p1.w;
+ float p2x = p2.x - fx * p2.w;
+ float p2y = p2.y - fy * p2.w;
+ float a0 = p1x*p2y - p1y*p2x;
+ float a1 = p2x*p0y - p2y*p0x;
+ float a2 = p0x*p1y - p0y*p1x;
+
+ // Compute inverse area with epsilon.
+ float at = a0 + a1 + a2;
+ float ep = copysignf(1e-6f, at); // ~1 pixel in 1k x 1k image.
+ float iw = 1.f / (at + ep);
+
+ // Perspective correct, normalized barycentrics.
+ float b0 = a0 * iw;
+ float b1 = a1 * iw;
+
+ // Position gradients.
+ float gb0 = dy.x * iw;
+ float gb1 = dy.y * iw;
+ float gbb = gb0 * b0 + gb1 * b1;
+ float gp0x = gbb * (p2y - p1y) - gb1 * p2y;
+ float gp1x = gbb * (p0y - p2y) + gb0 * p2y;
+ float gp2x = gbb * (p1y - p0y) - gb0 * p1y + gb1 * p0y;
+ float gp0y = gbb * (p1x - p2x) + gb1 * p2x;
+ float gp1y = gbb * (p2x - p0x) - gb0 * p2x;
+ float gp2y = gbb * (p0x - p1x) + gb0 * p1x - gb1 * p0x;
+ float gp0w = -fx * gp0x - fy * gp0y;
+ float gp1w = -fx * gp1x - fy * gp1y;
+ float gp2w = -fx * gp2x - fy * gp2y;
+
+ // Bary differential gradients.
+ if (ENABLE_DB && ((grad_all_ddb) << 1) != 0)
+ {
+ float dfxdX = p.xs * iw;
+ float dfydY = p.ys * iw;
+ ddb.x *= dfxdX;
+ ddb.y *= dfydY;
+ ddb.z *= dfxdX;
+ ddb.w *= dfydY;
+
+ float da0dX = p1.y * p2.w - p2.y * p1.w;
+ float da1dX = p2.y * p0.w - p0.y * p2.w;
+ float da2dX = p0.y * p1.w - p1.y * p0.w;
+ float da0dY = p2.x * p1.w - p1.x * p2.w;
+ float da1dY = p0.x * p2.w - p2.x * p0.w;
+ float da2dY = p1.x * p0.w - p0.x * p1.w;
+ float datdX = da0dX + da1dX + da2dX;
+ float datdY = da0dY + da1dY + da2dY;
+
+ float x01 = p0.x - p1.x;
+ float x12 = p1.x - p2.x;
+ float x20 = p2.x - p0.x;
+ float y01 = p0.y - p1.y;
+ float y12 = p1.y - p2.y;
+ float y20 = p2.y - p0.y;
+ float w01 = p0.w - p1.w;
+ float w12 = p1.w - p2.w;
+ float w20 = p2.w - p0.w;
+
+ float a0p1 = fy * p2.x - fx * p2.y;
+ float a0p2 = fx * p1.y - fy * p1.x;
+ float a1p0 = fx * p2.y - fy * p2.x;
+ float a1p2 = fy * p0.x - fx * p0.y;
+
+ float wdudX = 2.f * b0 * datdX - da0dX;
+ float wdudY = 2.f * b0 * datdY - da0dY;
+ float wdvdX = 2.f * b1 * datdX - da1dX;
+ float wdvdY = 2.f * b1 * datdY - da1dY;
+
+ float c0 = iw * (ddb.x * wdudX + ddb.y * wdudY + ddb.z * wdvdX + ddb.w * wdvdY);
+ float cx = c0 * fx - ddb.x * b0 - ddb.z * b1;
+ float cy = c0 * fy - ddb.y * b0 - ddb.w * b1;
+ float cxy = iw * (ddb.x * datdX + ddb.y * datdY);
+ float czw = iw * (ddb.z * datdX + ddb.w * datdY);
+
+ gp0x += c0 * y12 - cy * w12 + czw * p2y + ddb.w * p2.w;
+ gp1x += c0 * y20 - cy * w20 - cxy * p2y - ddb.y * p2.w;
+ gp2x += c0 * y01 - cy * w01 + cxy * p1y - czw * p0y + ddb.y * p1.w - ddb.w * p0.w;
+ gp0y += cx * w12 - c0 * x12 - czw * p2x - ddb.z * p2.w;
+ gp1y += cx * w20 - c0 * x20 + cxy * p2x + ddb.x * p2.w;
+ gp2y += cx * w01 - c0 * x01 - cxy * p1x + czw * p0x - ddb.x * p1.w + ddb.z * p0.w;
+ gp0w += cy * x12 - cx * y12 - czw * a1p0 + ddb.z * p2.y - ddb.w * p2.x;
+ gp1w += cy * x20 - cx * y20 - cxy * a0p1 - ddb.x * p2.y + ddb.y * p2.x;
+ gp2w += cy * x01 - cx * y01 - cxy * a0p2 - czw * a1p2 + ddb.x * p1.y - ddb.y * p1.x - ddb.z * p0.y + ddb.w * p0.x;
+ }
+
+ // Accumulate using coalesced atomics.
+ caAtomicAdd3_xyw(p.grad + 4 * vi0, gp0x, gp0y, gp0w);
+ caAtomicAdd3_xyw(p.grad + 4 * vi1, gp1x, gp1y, gp1w);
+ caAtomicAdd3_xyw(p.grad + 4 * vi2, gp2x, gp2y, gp2w);
+}
+
+// Template specializations.
+__global__ void RasterizeGradKernel  (const RasterizeGradParams p) { RasterizeGradKernelTemplate<false>(p); }
+__global__ void RasterizeGradKernelDb(const RasterizeGradParams p) { RasterizeGradKernelTemplate<true>(p); }
+
+//------------------------------------------------------------------------
diff --git a/LAM_gpro/external/nvdiffrast/nvdiffrast/common/rasterize.h b/LAM_gpro/external/nvdiffrast/nvdiffrast/common/rasterize.h
new file mode 100644
index 0000000..cb3104f
--- /dev/null
+++ b/LAM_gpro/external/nvdiffrast/nvdiffrast/common/rasterize.h
@@ -0,0 +1,60 @@
+// Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
+//
+// NVIDIA CORPORATION and its licensors retain all intellectual property
+// and proprietary rights in and to this software, related documentation
+// and any modifications thereto. Any use, reproduction, disclosure or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA CORPORATION is strictly prohibited.
+
+#pragma once
+
+//------------------------------------------------------------------------
+// Constants and helpers.
+
+#define RAST_CUDA_FWD_SHADER_KERNEL_BLOCK_WIDTH 8
+#define RAST_CUDA_FWD_SHADER_KERNEL_BLOCK_HEIGHT 8
+#define RAST_GRAD_MAX_KERNEL_BLOCK_WIDTH 8
+#define RAST_GRAD_MAX_KERNEL_BLOCK_HEIGHT 8
+
+//------------------------------------------------------------------------
+// CUDA forward rasterizer shader kernel params.
+
+struct RasterizeCudaFwdShaderParams
+{
+ const float* pos; // Vertex positions.
+ const int* tri; // Triangle indices.
+ const int* in_idx; // Triangle idx buffer from rasterizer.
+ float* out; // Main output buffer.
+ float* out_db; // Bary pixel gradient output buffer.
+ int numTriangles; // Number of triangles.
+ int numVertices; // Number of vertices.
+ int width_in; // Input image width.
+ int height_in; // Input image height.
+ int width_out; // Output image width.
+ int height_out; // Output image height.
+ int depth; // Size of minibatch.
+ int instance_mode; // 1 if in instance rendering mode.
+ float xs, xo, ys, yo; // Pixel position to clip-space x, y transform.
+};
+
+//------------------------------------------------------------------------
+// Gradient CUDA kernel params.
+
+struct RasterizeGradParams
+{
+ const float* pos; // Incoming position buffer.
+ const int* tri; // Incoming triangle buffer.
+ const float* out; // Rasterizer output buffer.
+ const float* dy; // Incoming gradients of rasterizer output buffer.
+ const float* ddb; // Incoming gradients of bary diff output buffer.
+ float* grad; // Outgoing position gradients.
+ int numTriangles; // Number of triangles.
+ int numVertices; // Number of vertices.
+ int width; // Image width.
+ int height; // Image height.
+ int depth; // Size of minibatch.
+ int instance_mode; // 1 if in instance rendering mode.
+ float xs, xo, ys, yo; // Pixel position to clip-space x, y transform.
+};
+
+//------------------------------------------------------------------------
diff --git a/LAM_gpro/external/nvdiffrast/nvdiffrast/common/rasterize_gl.cpp b/LAM_gpro/external/nvdiffrast/nvdiffrast/common/rasterize_gl.cpp
new file mode 100644
index 0000000..ac71ccd
--- /dev/null
+++ b/LAM_gpro/external/nvdiffrast/nvdiffrast/common/rasterize_gl.cpp
@@ -0,0 +1,644 @@
+// Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
+//
+// NVIDIA CORPORATION and its licensors retain all intellectual property
+// and proprietary rights in and to this software, related documentation
+// and any modifications thereto. Any use, reproduction, disclosure or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA CORPORATION is strictly prohibited.
+
+#include "rasterize_gl.h"
+#include "glutil.h"
+#include <string.h>
+#define STRINGIFY_SHADER_SOURCE(x) #x
+
+//------------------------------------------------------------------------
+// Helpers.
+
+#define ROUND_UP(x, y) ((((x) + ((y) - 1)) / (y)) * (y))
+static int ROUND_UP_BITS(uint32_t x, uint32_t y)
+{
+ // Round x up so that it has at most y bits of mantissa.
+ if (x < (1u << y))
+ return x;
+ uint32_t m = 0;
+ while (x & ~m)
+ m = (m << 1) | 1u;
+ m >>= y;
+ if (!(x & m))
+ return x;
+ return (x | m) + 1u;
+}
+
+//------------------------------------------------------------------------
+// Draw command struct used by rasterizer.
+
+struct GLDrawCmd
+{
+ uint32_t count;
+ uint32_t instanceCount;
+ uint32_t firstIndex;
+ uint32_t baseVertex;
+ uint32_t baseInstance;
+};
+
+//------------------------------------------------------------------------
+// GL helpers.
+
+static void compileGLShader(NVDR_CTX_ARGS, const RasterizeGLState& s, GLuint* pShader, GLenum shaderType, const char* src_buf)
+{
+ std::string src(src_buf);
+
+ // Set preprocessor directives.
+ int n = src.find('\n') + 1; // After first line containing #version directive.
+ if (s.enableZModify)
+ src.insert(n, "#define IF_ZMODIFY(x) x\n");
+ else
+ src.insert(n, "#define IF_ZMODIFY(x)\n");
+
+ const char *cstr = src.c_str();
+ *pShader = 0;
+ NVDR_CHECK_GL_ERROR(*pShader = glCreateShader(shaderType));
+ NVDR_CHECK_GL_ERROR(glShaderSource(*pShader, 1, &cstr, 0));
+ NVDR_CHECK_GL_ERROR(glCompileShader(*pShader));
+}
+
+static void constructGLProgram(NVDR_CTX_ARGS, GLuint* pProgram, GLuint glVertexShader, GLuint glGeometryShader, GLuint glFragmentShader)
+{
+ *pProgram = 0;
+
+ GLuint glProgram = 0;
+ NVDR_CHECK_GL_ERROR(glProgram = glCreateProgram());
+ NVDR_CHECK_GL_ERROR(glAttachShader(glProgram, glVertexShader));
+ NVDR_CHECK_GL_ERROR(glAttachShader(glProgram, glGeometryShader));
+ NVDR_CHECK_GL_ERROR(glAttachShader(glProgram, glFragmentShader));
+ NVDR_CHECK_GL_ERROR(glLinkProgram(glProgram));
+
+ GLint linkStatus = 0;
+ NVDR_CHECK_GL_ERROR(glGetProgramiv(glProgram, GL_LINK_STATUS, &linkStatus));
+ if (!linkStatus)
+ {
+ GLint infoLen = 0;
+ NVDR_CHECK_GL_ERROR(glGetProgramiv(glProgram, GL_INFO_LOG_LENGTH, &infoLen));
+ if (infoLen)
+ {
+ const char* hdr = "glLinkProgram() failed:\n";
+            std::vector<char> info(strlen(hdr) + infoLen);
+ strcpy(&info[0], hdr);
+ NVDR_CHECK_GL_ERROR(glGetProgramInfoLog(glProgram, infoLen, &infoLen, &info[strlen(hdr)]));
+ NVDR_CHECK(0, &info[0]);
+ }
+ NVDR_CHECK(0, "glLinkProgram() failed");
+ }
+
+ *pProgram = glProgram;
+}
+
+//------------------------------------------------------------------------
+// Shared C++ functions.
+
+void rasterizeInitGLContext(NVDR_CTX_ARGS, RasterizeGLState& s, int cudaDeviceIdx)
+{
+ // Create GL context and set it current.
+ s.glctx = createGLContext(cudaDeviceIdx);
+ setGLContext(s.glctx);
+
+ // Version check.
+ GLint vMajor = 0;
+ GLint vMinor = 0;
+ glGetIntegerv(GL_MAJOR_VERSION, &vMajor);
+ glGetIntegerv(GL_MINOR_VERSION, &vMinor);
+ glGetError(); // Clear possible GL_INVALID_ENUM error in version query.
+ LOG(INFO) << "OpenGL version reported as " << vMajor << "." << vMinor;
+ NVDR_CHECK((vMajor == 4 && vMinor >= 4) || vMajor > 4, "OpenGL 4.4 or later is required");
+
+ // Enable depth modification workaround on A100 and later.
+ int capMajor = 0;
+ NVDR_CHECK_CUDA_ERROR(cudaDeviceGetAttribute(&capMajor, cudaDevAttrComputeCapabilityMajor, cudaDeviceIdx));
+ s.enableZModify = (capMajor >= 8);
+
+ // Number of output buffers.
+ int num_outputs = s.enableDB ? 2 : 1;
+
+ // Set up vertex shader.
+ compileGLShader(NVDR_CTX_PARAMS, s, &s.glVertexShader, GL_VERTEX_SHADER,
+ "#version 330\n"
+ "#extension GL_ARB_shader_draw_parameters : enable\n"
+ STRINGIFY_SHADER_SOURCE(
+ layout(location = 0) in vec4 in_pos;
+ out int v_layer;
+ out int v_offset;
+ void main()
+ {
+ int layer = gl_DrawIDARB;
+ gl_Position = in_pos;
+ v_layer = layer;
+ v_offset = gl_BaseInstanceARB; // Sneak in TriID offset here.
+ }
+ )
+ );
+
+ // Geometry and fragment shaders depend on if bary differential output is enabled or not.
+ if (s.enableDB)
+ {
+ // Set up geometry shader. Calculation of per-pixel bary differentials is based on:
+ // u = (u/w) / (1/w)
+ // --> du/dX = d((u/w) / (1/w))/dX
+ // --> du/dX = [d(u/w)/dX - u*d(1/w)/dX] * w
+ // and we know both d(u/w)/dX and d(1/w)/dX are constant over triangle.
+ compileGLShader(NVDR_CTX_PARAMS, s, &s.glGeometryShader, GL_GEOMETRY_SHADER,
+ "#version 430\n"
+ STRINGIFY_SHADER_SOURCE(
+ layout(triangles) in;
+ layout(triangle_strip, max_vertices=3) out;
+ layout(location = 0) uniform vec2 vp_scale;
+ in int v_layer[];
+ in int v_offset[];
+ out vec4 var_uvzw;
+ out vec4 var_db;
+ void main()
+ {
+ // Plane equations for bary differentials.
+ float w0 = gl_in[0].gl_Position.w;
+ float w1 = gl_in[1].gl_Position.w;
+ float w2 = gl_in[2].gl_Position.w;
+ vec2 p0 = gl_in[0].gl_Position.xy;
+ vec2 p1 = gl_in[1].gl_Position.xy;
+ vec2 p2 = gl_in[2].gl_Position.xy;
+ vec2 e0 = p0*w2 - p2*w0;
+ vec2 e1 = p1*w2 - p2*w1;
+ float a = e0.x*e1.y - e0.y*e1.x;
+
+ // Clamp area to an epsilon to avoid arbitrarily high bary differentials.
+ float eps = 1e-6f; // ~1 pixel in 1k x 1k image.
+ float ca = (abs(a) >= eps) ? a : (a < 0.f) ? -eps : eps; // Clamp with sign.
+ float ia = 1.f / ca; // Inverse area.
+
+ vec2 ascl = ia * vp_scale;
+ float dudx = e1.y * ascl.x;
+ float dudy = -e1.x * ascl.y;
+ float dvdx = -e0.y * ascl.x;
+ float dvdy = e0.x * ascl.y;
+
+ float duwdx = w2 * dudx;
+ float dvwdx = w2 * dvdx;
+ float duvdx = w0 * dudx + w1 * dvdx;
+ float duwdy = w2 * dudy;
+ float dvwdy = w2 * dvdy;
+ float duvdy = w0 * dudy + w1 * dvdy;
+
+ vec4 db0 = vec4(duvdx - dvwdx, duvdy - dvwdy, dvwdx, dvwdy);
+ vec4 db1 = vec4(duwdx, duwdy, duvdx - duwdx, duvdy - duwdy);
+ vec4 db2 = vec4(duwdx, duwdy, dvwdx, dvwdy);
+
+ int layer_id = v_layer[0];
+ int prim_id = gl_PrimitiveIDIn + v_offset[0];
+
+ gl_Layer = layer_id; gl_PrimitiveID = prim_id; gl_Position = vec4(gl_in[0].gl_Position.x, gl_in[0].gl_Position.y, gl_in[0].gl_Position.z, gl_in[0].gl_Position.w); var_uvzw = vec4(1.f, 0.f, gl_in[0].gl_Position.z, gl_in[0].gl_Position.w); var_db = db0; EmitVertex();
+ gl_Layer = layer_id; gl_PrimitiveID = prim_id; gl_Position = vec4(gl_in[1].gl_Position.x, gl_in[1].gl_Position.y, gl_in[1].gl_Position.z, gl_in[1].gl_Position.w); var_uvzw = vec4(0.f, 1.f, gl_in[1].gl_Position.z, gl_in[1].gl_Position.w); var_db = db1; EmitVertex();
+ gl_Layer = layer_id; gl_PrimitiveID = prim_id; gl_Position = vec4(gl_in[2].gl_Position.x, gl_in[2].gl_Position.y, gl_in[2].gl_Position.z, gl_in[2].gl_Position.w); var_uvzw = vec4(0.f, 0.f, gl_in[2].gl_Position.z, gl_in[2].gl_Position.w); var_db = db2; EmitVertex();
+ }
+ )
+ );
+
+ // Set up fragment shader.
+ compileGLShader(NVDR_CTX_PARAMS, s, &s.glFragmentShader, GL_FRAGMENT_SHADER,
+ "#version 430\n"
+ STRINGIFY_SHADER_SOURCE(
+ in vec4 var_uvzw;
+ in vec4 var_db;
+ layout(location = 0) out vec4 out_raster;
+ layout(location = 1) out vec4 out_db;
+ IF_ZMODIFY(
+ layout(location = 1) uniform float in_dummy;
+ )
+ void main()
+ {
+ int id_int = gl_PrimitiveID + 1;
+ float id_float = (id_int <= 0x01000000) ? float(id_int) : intBitsToFloat(0x4a800000 + id_int);
+
+ out_raster = vec4(var_uvzw.x, var_uvzw.y, var_uvzw.z / var_uvzw.w, id_float);
+ out_db = var_db * var_uvzw.w;
+ IF_ZMODIFY(gl_FragDepth = gl_FragCoord.z + in_dummy;)
+ }
+ )
+ );
+
+ // Set up fragment shader for depth peeling.
+ compileGLShader(NVDR_CTX_PARAMS, s, &s.glFragmentShaderDP, GL_FRAGMENT_SHADER,
+ "#version 430\n"
+ STRINGIFY_SHADER_SOURCE(
+ in vec4 var_uvzw;
+ in vec4 var_db;
+ layout(binding = 0) uniform sampler2DArray out_prev;
+ layout(location = 0) out vec4 out_raster;
+ layout(location = 1) out vec4 out_db;
+ IF_ZMODIFY(
+ layout(location = 1) uniform float in_dummy;
+ )
+ void main()
+ {
+ int id_int = gl_PrimitiveID + 1;
+ float id_float = (id_int <= 0x01000000) ? float(id_int) : intBitsToFloat(0x4a800000 + id_int);
+
+ vec4 prev = texelFetch(out_prev, ivec3(gl_FragCoord.x, gl_FragCoord.y, gl_Layer), 0);
+ float depth_new = var_uvzw.z / var_uvzw.w;
+ if (prev.w == 0 || depth_new <= prev.z)
+ discard;
+ out_raster = vec4(var_uvzw.x, var_uvzw.y, depth_new, id_float);
+ out_db = var_db * var_uvzw.w;
+ IF_ZMODIFY(gl_FragDepth = gl_FragCoord.z + in_dummy;)
+ }
+ )
+ );
+ }
+ else
+ {
+ // Geometry shader without bary differential output.
+ compileGLShader(NVDR_CTX_PARAMS, s, &s.glGeometryShader, GL_GEOMETRY_SHADER,
+ "#version 330\n"
+ STRINGIFY_SHADER_SOURCE(
+ layout(triangles) in;
+ layout(triangle_strip, max_vertices=3) out;
+ in int v_layer[];
+ in int v_offset[];
+ out vec4 var_uvzw;
+ void main()
+ {
+ int layer_id = v_layer[0];
+ int prim_id = gl_PrimitiveIDIn + v_offset[0];
+
+ gl_Layer = layer_id; gl_PrimitiveID = prim_id; gl_Position = vec4(gl_in[0].gl_Position.x, gl_in[0].gl_Position.y, gl_in[0].gl_Position.z, gl_in[0].gl_Position.w); var_uvzw = vec4(1.f, 0.f, gl_in[0].gl_Position.z, gl_in[0].gl_Position.w); EmitVertex();
+ gl_Layer = layer_id; gl_PrimitiveID = prim_id; gl_Position = vec4(gl_in[1].gl_Position.x, gl_in[1].gl_Position.y, gl_in[1].gl_Position.z, gl_in[1].gl_Position.w); var_uvzw = vec4(0.f, 1.f, gl_in[1].gl_Position.z, gl_in[1].gl_Position.w); EmitVertex();
+ gl_Layer = layer_id; gl_PrimitiveID = prim_id; gl_Position = vec4(gl_in[2].gl_Position.x, gl_in[2].gl_Position.y, gl_in[2].gl_Position.z, gl_in[2].gl_Position.w); var_uvzw = vec4(0.f, 0.f, gl_in[2].gl_Position.z, gl_in[2].gl_Position.w); EmitVertex();
+ }
+ )
+ );
+
+ // Fragment shader without bary differential output.
+ compileGLShader(NVDR_CTX_PARAMS, s, &s.glFragmentShader, GL_FRAGMENT_SHADER,
+ "#version 430\n"
+ STRINGIFY_SHADER_SOURCE(
+ in vec4 var_uvzw;
+ layout(location = 0) out vec4 out_raster;
+ IF_ZMODIFY(
+ layout(location = 1) uniform float in_dummy;
+ )
+ void main()
+ {
+ int id_int = gl_PrimitiveID + 1;
+ float id_float = (id_int <= 0x01000000) ? float(id_int) : intBitsToFloat(0x4a800000 + id_int);
+
+ out_raster = vec4(var_uvzw.x, var_uvzw.y, var_uvzw.z / var_uvzw.w, id_float);
+ IF_ZMODIFY(gl_FragDepth = gl_FragCoord.z + in_dummy;)
+ }
+ )
+ );
+
+ // Depth peeling variant of fragment shader.
+ compileGLShader(NVDR_CTX_PARAMS, s, &s.glFragmentShaderDP, GL_FRAGMENT_SHADER,
+ "#version 430\n"
+ STRINGIFY_SHADER_SOURCE(
+ in vec4 var_uvzw;
+ layout(binding = 0) uniform sampler2DArray out_prev;
+ layout(location = 0) out vec4 out_raster;
+ IF_ZMODIFY(
+ layout(location = 1) uniform float in_dummy;
+ )
+ void main()
+ {
+ int id_int = gl_PrimitiveID + 1;
+ float id_float = (id_int <= 0x01000000) ? float(id_int) : intBitsToFloat(0x4a800000 + id_int);
+
+ vec4 prev = texelFetch(out_prev, ivec3(gl_FragCoord.x, gl_FragCoord.y, gl_Layer), 0);
+ float depth_new = var_uvzw.z / var_uvzw.w;
+ if (prev.w == 0 || depth_new <= prev.z)
+ discard;
+ out_raster = vec4(var_uvzw.x, var_uvzw.y, var_uvzw.z / var_uvzw.w, id_float);
+ IF_ZMODIFY(gl_FragDepth = gl_FragCoord.z + in_dummy;)
+ }
+ )
+ );
+ }
+
+ // Finalize programs.
+ constructGLProgram(NVDR_CTX_PARAMS, &s.glProgram, s.glVertexShader, s.glGeometryShader, s.glFragmentShader);
+ constructGLProgram(NVDR_CTX_PARAMS, &s.glProgramDP, s.glVertexShader, s.glGeometryShader, s.glFragmentShaderDP);
+
+ // Construct main fbo and bind permanently.
+ NVDR_CHECK_GL_ERROR(glGenFramebuffers(1, &s.glFBO));
+ NVDR_CHECK_GL_ERROR(glBindFramebuffer(GL_FRAMEBUFFER, s.glFBO));
+
+ // Enable two color attachments.
+ GLenum draw_buffers[2] = { GL_COLOR_ATTACHMENT0, GL_COLOR_ATTACHMENT1 };
+ NVDR_CHECK_GL_ERROR(glDrawBuffers(num_outputs, draw_buffers));
+
+ // Construct vertex array object.
+ NVDR_CHECK_GL_ERROR(glGenVertexArrays(1, &s.glVAO));
+ NVDR_CHECK_GL_ERROR(glBindVertexArray(s.glVAO));
+
+ // Construct position buffer, bind permanently, enable, set ptr.
+ NVDR_CHECK_GL_ERROR(glGenBuffers(1, &s.glPosBuffer));
+ NVDR_CHECK_GL_ERROR(glBindBuffer(GL_ARRAY_BUFFER, s.glPosBuffer));
+ NVDR_CHECK_GL_ERROR(glEnableVertexAttribArray(0));
+ NVDR_CHECK_GL_ERROR(glVertexAttribPointer(0, 4, GL_FLOAT, GL_FALSE, 0, 0));
+
+ // Construct index buffer and bind permanently.
+ NVDR_CHECK_GL_ERROR(glGenBuffers(1, &s.glTriBuffer));
+ NVDR_CHECK_GL_ERROR(glBindBuffer(GL_ELEMENT_ARRAY_BUFFER, s.glTriBuffer));
+
+ // Set up depth test.
+ NVDR_CHECK_GL_ERROR(glEnable(GL_DEPTH_TEST));
+ NVDR_CHECK_GL_ERROR(glDepthFunc(GL_LESS));
+ NVDR_CHECK_GL_ERROR(glClearDepth(1.0));
+
+ // Create and bind output buffers. Storage is allocated later.
+ NVDR_CHECK_GL_ERROR(glGenTextures(num_outputs, s.glColorBuffer));
+ for (int i=0; i < num_outputs; i++)
+ {
+ NVDR_CHECK_GL_ERROR(glBindTexture(GL_TEXTURE_2D_ARRAY, s.glColorBuffer[i]));
+ NVDR_CHECK_GL_ERROR(glFramebufferTexture(GL_FRAMEBUFFER, GL_COLOR_ATTACHMENT0 + i, s.glColorBuffer[i], 0));
+ }
+
+ // Create and bind depth/stencil buffer. Storage is allocated later.
+ NVDR_CHECK_GL_ERROR(glGenTextures(1, &s.glDepthStencilBuffer));
+ NVDR_CHECK_GL_ERROR(glBindTexture(GL_TEXTURE_2D_ARRAY, s.glDepthStencilBuffer));
+ NVDR_CHECK_GL_ERROR(glFramebufferTexture(GL_FRAMEBUFFER, GL_DEPTH_STENCIL_ATTACHMENT, s.glDepthStencilBuffer, 0));
+
+ // Create texture name for previous output buffer (depth peeling).
+ NVDR_CHECK_GL_ERROR(glGenTextures(1, &s.glPrevOutBuffer));
+}
+
+void rasterizeResizeBuffers(NVDR_CTX_ARGS, RasterizeGLState& s, bool& changes, int posCount, int triCount, int width, int height, int depth)
+{
+ changes = false;
+
+ // Resize vertex buffer?
+ if (posCount > s.posCount)
+ {
+ if (s.cudaPosBuffer)
+ NVDR_CHECK_CUDA_ERROR(cudaGraphicsUnregisterResource(s.cudaPosBuffer));
+ s.posCount = (posCount > 64) ? ROUND_UP_BITS(posCount, 2) : 64;
+ LOG(INFO) << "Increasing position buffer size to " << s.posCount << " float32";
+ NVDR_CHECK_GL_ERROR(glBufferData(GL_ARRAY_BUFFER, s.posCount * sizeof(float), NULL, GL_DYNAMIC_DRAW));
+ NVDR_CHECK_CUDA_ERROR(cudaGraphicsGLRegisterBuffer(&s.cudaPosBuffer, s.glPosBuffer, cudaGraphicsRegisterFlagsWriteDiscard));
+ changes = true;
+ }
+
+ // Resize triangle buffer?
+ if (triCount > s.triCount)
+ {
+ if (s.cudaTriBuffer)
+ NVDR_CHECK_CUDA_ERROR(cudaGraphicsUnregisterResource(s.cudaTriBuffer));
+ s.triCount = (triCount > 64) ? ROUND_UP_BITS(triCount, 2) : 64;
+ LOG(INFO) << "Increasing triangle buffer size to " << s.triCount << " int32";
+ NVDR_CHECK_GL_ERROR(glBufferData(GL_ELEMENT_ARRAY_BUFFER, s.triCount * sizeof(int32_t), NULL, GL_DYNAMIC_DRAW));
+ NVDR_CHECK_CUDA_ERROR(cudaGraphicsGLRegisterBuffer(&s.cudaTriBuffer, s.glTriBuffer, cudaGraphicsRegisterFlagsWriteDiscard));
+ changes = true;
+ }
+
+ // Resize framebuffer?
+ if (width > s.width || height > s.height || depth > s.depth)
+ {
+ int num_outputs = s.enableDB ? 2 : 1;
+ if (s.cudaColorBuffer[0])
+ for (int i=0; i < num_outputs; i++)
+ NVDR_CHECK_CUDA_ERROR(cudaGraphicsUnregisterResource(s.cudaColorBuffer[i]));
+
+ if (s.cudaPrevOutBuffer)
+ {
+ NVDR_CHECK_CUDA_ERROR(cudaGraphicsUnregisterResource(s.cudaPrevOutBuffer));
+ s.cudaPrevOutBuffer = 0;
+ }
+
+ // New framebuffer size.
+ s.width = (width > s.width) ? width : s.width;
+ s.height = (height > s.height) ? height : s.height;
+ s.depth = (depth > s.depth) ? depth : s.depth;
+ s.width = ROUND_UP(s.width, 32);
+ s.height = ROUND_UP(s.height, 32);
+ LOG(INFO) << "Increasing frame buffer size to (width, height, depth) = (" << s.width << ", " << s.height << ", " << s.depth << ")";
+
+ // Allocate color buffers.
+ for (int i=0; i < num_outputs; i++)
+ {
+ NVDR_CHECK_GL_ERROR(glBindTexture(GL_TEXTURE_2D_ARRAY, s.glColorBuffer[i]));
+ NVDR_CHECK_GL_ERROR(glTexImage3D(GL_TEXTURE_2D_ARRAY, 0, GL_RGBA32F, s.width, s.height, s.depth, 0, GL_RGBA, GL_UNSIGNED_BYTE, 0));
+ NVDR_CHECK_GL_ERROR(glTexParameteri(GL_TEXTURE_2D_ARRAY, GL_TEXTURE_MAG_FILTER, GL_NEAREST));
+ NVDR_CHECK_GL_ERROR(glTexParameteri(GL_TEXTURE_2D_ARRAY, GL_TEXTURE_MIN_FILTER, GL_NEAREST));
+ NVDR_CHECK_GL_ERROR(glTexParameteri(GL_TEXTURE_2D_ARRAY, GL_TEXTURE_WRAP_S, GL_CLAMP_TO_EDGE));
+ NVDR_CHECK_GL_ERROR(glTexParameteri(GL_TEXTURE_2D_ARRAY, GL_TEXTURE_WRAP_T, GL_CLAMP_TO_EDGE));
+ }
+
+ // Allocate depth/stencil buffer.
+ NVDR_CHECK_GL_ERROR(glBindTexture(GL_TEXTURE_2D_ARRAY, s.glDepthStencilBuffer));
+ NVDR_CHECK_GL_ERROR(glTexImage3D(GL_TEXTURE_2D_ARRAY, 0, GL_DEPTH24_STENCIL8, s.width, s.height, s.depth, 0, GL_DEPTH_STENCIL, GL_UNSIGNED_INT_24_8, 0));
+
+ // (Re-)register all GL buffers into Cuda.
+ for (int i=0; i < num_outputs; i++)
+ NVDR_CHECK_CUDA_ERROR(cudaGraphicsGLRegisterImage(&s.cudaColorBuffer[i], s.glColorBuffer[i], GL_TEXTURE_3D, cudaGraphicsRegisterFlagsReadOnly));
+
+ changes = true;
+ }
+}
+
+void rasterizeRender(NVDR_CTX_ARGS, RasterizeGLState& s, cudaStream_t stream, const float* posPtr, int posCount, int vtxPerInstance, const int32_t* triPtr, int triCount, const int32_t* rangesPtr, int width, int height, int depth, int peeling_idx)
+{
+ // Only copy inputs if we are on first iteration of depth peeling or not doing it at all.
+ if (peeling_idx < 1)
+ {
+ if (triPtr)
+ {
+ // Copy both position and triangle buffers.
+ void* glPosPtr = NULL;
+ void* glTriPtr = NULL;
+ size_t posBytes = 0;
+ size_t triBytes = 0;
+ NVDR_CHECK_CUDA_ERROR(cudaGraphicsMapResources(2, &s.cudaPosBuffer, stream));
+ NVDR_CHECK_CUDA_ERROR(cudaGraphicsResourceGetMappedPointer(&glPosPtr, &posBytes, s.cudaPosBuffer));
+ NVDR_CHECK_CUDA_ERROR(cudaGraphicsResourceGetMappedPointer(&glTriPtr, &triBytes, s.cudaTriBuffer));
+ NVDR_CHECK(posBytes >= posCount * sizeof(float), "mapped GL position buffer size mismatch");
+ NVDR_CHECK(triBytes >= triCount * sizeof(int32_t), "mapped GL triangle buffer size mismatch");
+ NVDR_CHECK_CUDA_ERROR(cudaMemcpyAsync(glPosPtr, posPtr, posCount * sizeof(float), cudaMemcpyDeviceToDevice, stream));
+ NVDR_CHECK_CUDA_ERROR(cudaMemcpyAsync(glTriPtr, triPtr, triCount * sizeof(int32_t), cudaMemcpyDeviceToDevice, stream));
+ NVDR_CHECK_CUDA_ERROR(cudaGraphicsUnmapResources(2, &s.cudaPosBuffer, stream));
+ }
+ else
+ {
+ // Copy position buffer only. Triangles are already copied and known to be constant.
+ void* glPosPtr = NULL;
+ size_t posBytes = 0;
+ NVDR_CHECK_CUDA_ERROR(cudaGraphicsMapResources(1, &s.cudaPosBuffer, stream));
+ NVDR_CHECK_CUDA_ERROR(cudaGraphicsResourceGetMappedPointer(&glPosPtr, &posBytes, s.cudaPosBuffer));
+ NVDR_CHECK(posBytes >= posCount * sizeof(float), "mapped GL position buffer size mismatch");
+ NVDR_CHECK_CUDA_ERROR(cudaMemcpyAsync(glPosPtr, posPtr, posCount * sizeof(float), cudaMemcpyDeviceToDevice, stream));
+ NVDR_CHECK_CUDA_ERROR(cudaGraphicsUnmapResources(1, &s.cudaPosBuffer, stream));
+ }
+ }
+
+ // Select program based on whether we have a depth peeling input or not.
+ if (peeling_idx < 1)
+ {
+ // Normal case: No peeling, or peeling disabled.
+ NVDR_CHECK_GL_ERROR(glUseProgram(s.glProgram));
+ }
+ else
+ {
+ // If we don't have a third buffer yet, create one.
+ if (!s.cudaPrevOutBuffer)
+ {
+ NVDR_CHECK_GL_ERROR(glBindTexture(GL_TEXTURE_2D_ARRAY, s.glPrevOutBuffer));
+ NVDR_CHECK_GL_ERROR(glTexImage3D(GL_TEXTURE_2D_ARRAY, 0, GL_RGBA32F, s.width, s.height, s.depth, 0, GL_RGBA, GL_UNSIGNED_BYTE, 0));
+ NVDR_CHECK_GL_ERROR(glTexParameteri(GL_TEXTURE_2D_ARRAY, GL_TEXTURE_MAG_FILTER, GL_NEAREST));
+ NVDR_CHECK_GL_ERROR(glTexParameteri(GL_TEXTURE_2D_ARRAY, GL_TEXTURE_MIN_FILTER, GL_NEAREST));
+ NVDR_CHECK_GL_ERROR(glTexParameteri(GL_TEXTURE_2D_ARRAY, GL_TEXTURE_WRAP_S, GL_CLAMP_TO_EDGE));
+ NVDR_CHECK_GL_ERROR(glTexParameteri(GL_TEXTURE_2D_ARRAY, GL_TEXTURE_WRAP_T, GL_CLAMP_TO_EDGE));
+ NVDR_CHECK_CUDA_ERROR(cudaGraphicsGLRegisterImage(&s.cudaPrevOutBuffer, s.glPrevOutBuffer, GL_TEXTURE_3D, cudaGraphicsRegisterFlagsReadOnly));
+ }
+
+ // Swap the GL buffers.
+ GLuint glTempBuffer = s.glPrevOutBuffer;
+ s.glPrevOutBuffer = s.glColorBuffer[0];
+ s.glColorBuffer[0] = glTempBuffer;
+
+ // Swap the Cuda buffers.
+ cudaGraphicsResource_t cudaTempBuffer = s.cudaPrevOutBuffer;
+ s.cudaPrevOutBuffer = s.cudaColorBuffer[0];
+ s.cudaColorBuffer[0] = cudaTempBuffer;
+
+ // Bind the new output buffer.
+ NVDR_CHECK_GL_ERROR(glBindTexture(GL_TEXTURE_2D_ARRAY, s.glColorBuffer[0]));
+ NVDR_CHECK_GL_ERROR(glFramebufferTexture(GL_FRAMEBUFFER, GL_COLOR_ATTACHMENT0, s.glColorBuffer[0], 0));
+
+ // Bind old buffer as the input texture.
+ NVDR_CHECK_GL_ERROR(glBindTexture(GL_TEXTURE_2D_ARRAY, s.glPrevOutBuffer));
+
+ // Activate the correct program.
+ NVDR_CHECK_GL_ERROR(glUseProgram(s.glProgramDP));
+ }
+
+ // Set viewport, clear color buffer(s) and depth/stencil buffer.
+ NVDR_CHECK_GL_ERROR(glViewport(0, 0, width, height));
+ NVDR_CHECK_GL_ERROR(glClear(GL_COLOR_BUFFER_BIT | GL_DEPTH_BUFFER_BIT | GL_STENCIL_BUFFER_BIT));
+
+ // If outputting bary differentials, set resolution uniform
+ if (s.enableDB)
+ NVDR_CHECK_GL_ERROR(glUniform2f(0, 2.f / (float)width, 2.f / (float)height));
+
+ // Set the dummy uniform if depth modification workaround is active.
+ if (s.enableZModify)
+ NVDR_CHECK_GL_ERROR(glUniform1f(1, 0.f));
+
+ // Render the meshes.
+ if (depth == 1 && !rangesPtr)
+ {
+ // Trivial case.
+ NVDR_CHECK_GL_ERROR(glDrawElements(GL_TRIANGLES, triCount, GL_UNSIGNED_INT, 0));
+ }
+ else
+ {
+ // Populate a buffer for draw commands and execute it.
+ std::vector drawCmdBuffer(depth);
+
+ if (!rangesPtr)
+ {
+ // Fill in range array to instantiate the same triangles for each output layer.
+ // Triangle IDs starts at zero (i.e., one) for each layer, so they correspond to
+ // the first dimension in addressing the triangle array.
+ for (int i=0; i < depth; i++)
+ {
+ GLDrawCmd& cmd = drawCmdBuffer[i];
+ cmd.firstIndex = 0;
+ cmd.count = triCount;
+ cmd.baseVertex = vtxPerInstance * i;
+ cmd.baseInstance = 0;
+ cmd.instanceCount = 1;
+ }
+ }
+ else
+ {
+ // Fill in the range array according to user-given ranges. Triangle IDs point
+ // to the input triangle array, NOT index within range, so they correspond to
+ // the first dimension in addressing the triangle array.
+ for (int i=0, j=0; i < depth; i++)
+ {
+ GLDrawCmd& cmd = drawCmdBuffer[i];
+ int first = rangesPtr[j++];
+ int count = rangesPtr[j++];
+ NVDR_CHECK(first >= 0 && count >= 0, "range contains negative values");
+ NVDR_CHECK((first + count) * 3 <= triCount, "range extends beyond end of triangle buffer");
+ cmd.firstIndex = first * 3;
+ cmd.count = count * 3;
+ cmd.baseVertex = 0;
+ cmd.baseInstance = first;
+ cmd.instanceCount = 1;
+ }
+ }
+
+ // Draw!
+ NVDR_CHECK_GL_ERROR(glMultiDrawElementsIndirect(GL_TRIANGLES, GL_UNSIGNED_INT, &drawCmdBuffer[0], depth, sizeof(GLDrawCmd)));
+ }
+}
+
+// Copy the rendered GL color buffer(s) into caller-provided CUDA device
+// tensors. outputPtr[0] receives the color output; outputPtr[1] receives the
+// barycentric-differential output, but only when s.enableDB is set (then
+// num_outputs == 2). Copies are enqueued on 'stream' with cudaMemcpy3DAsync;
+// the caller is responsible for any synchronization before reading results.
+void rasterizeCopyResults(NVDR_CTX_ARGS, RasterizeGLState& s, cudaStream_t stream, float** outputPtr, int width, int height, int depth)
+{
+ // Copy color buffers to output tensors.
+ cudaArray_t array = 0;
+ cudaChannelFormatDesc arrayDesc = {}; // For error checking.
+ cudaExtent arrayExt = {}; // For error checking.
+ int num_outputs = s.enableDB ? 2 : 1;
+ NVDR_CHECK_CUDA_ERROR(cudaGraphicsMapResources(num_outputs, s.cudaColorBuffer, stream));
+ for (int i=0; i < num_outputs; i++)
+ {
+ // Mip level 0, array slice 0 of the registered GL texture.
+ NVDR_CHECK_CUDA_ERROR(cudaGraphicsSubResourceGetMappedArray(&array, s.cudaColorBuffer[i], 0, 0));
+ NVDR_CHECK_CUDA_ERROR(cudaArrayGetInfo(&arrayDesc, &arrayExt, NULL, array));
+ // Sanity-check that the mapped array is RGBA32F and large enough.
+ NVDR_CHECK(arrayDesc.f == cudaChannelFormatKindFloat, "CUDA mapped array data kind mismatch");
+ NVDR_CHECK(arrayDesc.x == 32 && arrayDesc.y == 32 && arrayDesc.z == 32 && arrayDesc.w == 32, "CUDA mapped array data width mismatch");
+ NVDR_CHECK(arrayExt.width >= width && arrayExt.height >= height && arrayExt.depth >= depth, "CUDA mapped array extent mismatch");
+ // Destination is a tightly packed (depth, height, width, 4) float tensor.
+ cudaMemcpy3DParms p = {0};
+ p.srcArray = array;
+ p.dstPtr.ptr = outputPtr[i];
+ p.dstPtr.pitch = width * 4 * sizeof(float);
+ p.dstPtr.xsize = width;
+ p.dstPtr.ysize = height;
+ p.extent.width = width;
+ p.extent.height = height;
+ p.extent.depth = depth;
+ p.kind = cudaMemcpyDeviceToDevice;
+ NVDR_CHECK_CUDA_ERROR(cudaMemcpy3DAsync(&p, stream));
+ }
+ NVDR_CHECK_CUDA_ERROR(cudaGraphicsUnmapResources(num_outputs, s.cudaColorBuffer, stream));
+}
+
+// Unregister all CUDA graphics resources held in the rasterizer state and
+// reset the handles to zero so the state can be safely reused or destroyed.
+// GL-side objects are not deleted here; only the CUDA interop registrations
+// are released. Idempotent: zeroed handles are skipped.
+void rasterizeReleaseBuffers(NVDR_CTX_ARGS, RasterizeGLState& s)
+{
+ int num_outputs = s.enableDB ? 2 : 1;
+
+ if (s.cudaPosBuffer)
+ {
+ NVDR_CHECK_CUDA_ERROR(cudaGraphicsUnregisterResource(s.cudaPosBuffer));
+ s.cudaPosBuffer = 0;
+ }
+
+ if (s.cudaTriBuffer)
+ {
+ NVDR_CHECK_CUDA_ERROR(cudaGraphicsUnregisterResource(s.cudaTriBuffer));
+ s.cudaTriBuffer = 0;
+ }
+
+ for (int i=0; i < num_outputs; i++)
+ {
+ if (s.cudaColorBuffer[i])
+ {
+ NVDR_CHECK_CUDA_ERROR(cudaGraphicsUnregisterResource(s.cudaColorBuffer[i]));
+ s.cudaColorBuffer[i] = 0;
+ }
+ }
+
+ if (s.cudaPrevOutBuffer)
+ {
+ NVDR_CHECK_CUDA_ERROR(cudaGraphicsUnregisterResource(s.cudaPrevOutBuffer));
+ s.cudaPrevOutBuffer = 0;
+ }
+}
+
+//------------------------------------------------------------------------
diff --git a/LAM_gpro/external/nvdiffrast/nvdiffrast/common/rasterize_gl.h b/LAM_gpro/external/nvdiffrast/nvdiffrast/common/rasterize_gl.h
new file mode 100644
index 0000000..27537c5
--- /dev/null
+++ b/LAM_gpro/external/nvdiffrast/nvdiffrast/common/rasterize_gl.h
@@ -0,0 +1,60 @@
+// Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
+//
+// NVIDIA CORPORATION and its licensors retain all intellectual property
+// and proprietary rights in and to this software, related documentation
+// and any modifications thereto. Any use, reproduction, disclosure or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA CORPORATION is strictly prohibited.
+
+#pragma once
+
+//------------------------------------------------------------------------
+// Do not try to include OpenGL stuff when compiling CUDA kernels for torch.
+
+#if !(defined(NVDR_TORCH) && defined(__CUDACC__))
+#include "framework.h"
+#include "glutil.h"
+
+//------------------------------------------------------------------------
+// OpenGL-related persistent state for forward op.
+
+struct RasterizeGLState // Must be initializable by memset to zero.
+{
+ int width; // Allocated frame buffer width.
+ int height; // Allocated frame buffer height.
+ int depth; // Allocated frame buffer depth.
+ int posCount; // Allocated position buffer in floats.
+ int triCount; // Allocated triangle buffer in ints.
+ GLContext glctx; // OpenGL context this state lives in.
+ GLuint glFBO; // Framebuffer object for offscreen rendering.
+ GLuint glColorBuffer[2]; // Color attachments; [1] used only when enableDB.
+ GLuint glPrevOutBuffer; // Previous-pass output texture for depth peeling.
+ GLuint glDepthStencilBuffer;
+ GLuint glVAO;
+ GLuint glTriBuffer; // Triangle index buffer.
+ GLuint glPosBuffer; // Vertex position buffer.
+ GLuint glProgram; // Program for the normal (non-peeling) pass.
+ GLuint glProgramDP; // Program for depth-peeling passes.
+ GLuint glVertexShader;
+ GLuint glGeometryShader;
+ GLuint glFragmentShader;
+ GLuint glFragmentShaderDP; // Fragment shader variant for depth peeling.
+ cudaGraphicsResource_t cudaColorBuffer[2]; // CUDA interop handles for glColorBuffer.
+ cudaGraphicsResource_t cudaPrevOutBuffer; // CUDA interop handle for glPrevOutBuffer.
+ cudaGraphicsResource_t cudaPosBuffer; // CUDA interop handle for glPosBuffer.
+ cudaGraphicsResource_t cudaTriBuffer; // CUDA interop handle for glTriBuffer.
+ int enableDB; // Nonzero when bary differential output is enabled.
+ int enableZModify; // Modify depth in shader, workaround for a rasterization issue on A100.
+};
+
+//------------------------------------------------------------------------
+// Shared C++ code prototypes.
+
+void rasterizeInitGLContext(NVDR_CTX_ARGS, RasterizeGLState& s, int cudaDeviceIdx);
+void rasterizeResizeBuffers(NVDR_CTX_ARGS, RasterizeGLState& s, bool& changes, int posCount, int triCount, int width, int height, int depth);
+void rasterizeRender(NVDR_CTX_ARGS, RasterizeGLState& s, cudaStream_t stream, const float* posPtr, int posCount, int vtxPerInstance, const int32_t* triPtr, int triCount, const int32_t* rangesPtr, int width, int height, int depth, int peeling_idx);
+void rasterizeCopyResults(NVDR_CTX_ARGS, RasterizeGLState& s, cudaStream_t stream, float** outputPtr, int width, int height, int depth);
+void rasterizeReleaseBuffers(NVDR_CTX_ARGS, RasterizeGLState& s);
+
+//------------------------------------------------------------------------
+#endif // !(defined(NVDR_TORCH) && defined(__CUDACC__))
diff --git a/LAM_gpro/external/nvdiffrast/nvdiffrast/common/texture.cpp b/LAM_gpro/external/nvdiffrast/nvdiffrast/common/texture.cpp
new file mode 100644
index 0000000..51633e1
--- /dev/null
+++ b/LAM_gpro/external/nvdiffrast/nvdiffrast/common/texture.cpp
@@ -0,0 +1,104 @@
+// Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
+//
+// NVIDIA CORPORATION and its licensors retain all intellectual property
+// and proprietary rights in and to this software, related documentation
+// and any modifications thereto. Any use, reproduction, disclosure or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA CORPORATION is strictly prohibited.
+
+#include "framework.h"
+#include "texture.h"
+
+//------------------------------------------------------------------------
+// Mip stack construction and access helpers.
+
+void raiseMipSizeError(NVDR_CTX_ARGS, const TextureKernelParams& p)
+{
+ char buf[1024];
+ int bufsz = 1024;
+
+ std::string msg = "Mip-map size error - cannot downsample an odd extent greater than 1. Resize the texture so that both spatial extents are powers of two, or limit the number of mip maps using max_mip_level argument.\n";
+
+ int w = p.texWidth;
+ int h = p.texHeight;
+ bool ew = false;
+ bool eh = false;
+
+ msg += "Attempted mip stack construction:\n";
+ msg += "level width height\n";
+ msg += "----- ----- ------\n";
+ snprintf(buf, bufsz, "base %5d %5d\n", w, h);
+ msg += buf;
+
+ int mipTotal = 0;
+ int level = 0;
+ while ((w|h) > 1 && !(ew || eh)) // Stop at first impossible size.
+ {
+ // Current level.
+ level += 1;
+
+ // Determine if downsampling fails.
+ ew = ew || (w > 1 && (w & 1));
+ eh = eh || (h > 1 && (h & 1));
+
+ // Downsample.
+ if (w > 1) w >>= 1;
+ if (h > 1) h >>= 1;
+
+ // Append level size to error message.
+ snprintf(buf, bufsz, "mip %-2d ", level);
+ msg += buf;
+ if (ew) snprintf(buf, bufsz, " err ");
+ else snprintf(buf, bufsz, "%5d ", w);
+ msg += buf;
+ if (eh) snprintf(buf, bufsz, " err\n");
+ else snprintf(buf, bufsz, "%5d\n", h);
+ msg += buf;
+ }
+
+ NVDR_CHECK(0, msg);
+}
+
+// Compute per-level offsets (in floats) of a mip stack for the texture
+// described by p, writing them into mipOffsets (index 0 is the base level at
+// offset 0), set p.mipLevelMax to the last level built, and return the total
+// number of floats needed for all mip levels combined. Raises an error (does
+// not return) if an odd extent > 1 would have to be halved.
+// NOTE(review): mipOffsets must have room for at least mipLevelMax+1 entries —
+// caller-side contract, not checked here.
+int calculateMipInfo(NVDR_CTX_ARGS, TextureKernelParams& p, int* mipOffsets)
+{
+ // No levels at all?
+ if (p.mipLevelLimit == 0)
+ {
+ p.mipLevelMax = 0;
+ return 0;
+ }
+
+ // Current level size.
+ int w = p.texWidth;
+ int h = p.texHeight;
+
+ int mipTotal = 0;
+ int level = 0;
+ // Cube maps store 6 faces per texture, hence 6x the channel count per texel column.
+ int c = (p.boundaryMode == TEX_BOUNDARY_MODE_CUBE) ? (p.channels * 6) : p.channels;
+ mipOffsets[0] = 0;
+ while ((w|h) > 1)
+ {
+ // Current level.
+ level += 1;
+
+ // Quit if cannot downsample.
+ if ((w > 1 && (w & 1)) || (h > 1 && (h & 1)))
+ raiseMipSizeError(NVDR_CTX_PARAMS, p);
+
+ // Downsample.
+ if (w > 1) w >>= 1;
+ if (h > 1) h >>= 1;
+
+ mipOffsets[level] = mipTotal; // Store the mip offset (#floats).
+ mipTotal += w * h * p.texDepth * c;
+
+ // Hit the level limit?
+ if (p.mipLevelLimit >= 0 && level == p.mipLevelLimit)
+ break;
+ }
+
+ p.mipLevelMax = level;
+ return mipTotal;
+}
+
+//------------------------------------------------------------------------
diff --git a/LAM_gpro/external/nvdiffrast/nvdiffrast/common/texture.cu b/LAM_gpro/external/nvdiffrast/nvdiffrast/common/texture.cu
new file mode 100644
index 0000000..490b8d6
--- /dev/null
+++ b/LAM_gpro/external/nvdiffrast/nvdiffrast/common/texture.cu
@@ -0,0 +1,1156 @@
+// Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
+//
+// NVIDIA CORPORATION and its licensors retain all intellectual property
+// and proprietary rights in and to this software, related documentation
+// and any modifications thereto. Any use, reproduction, disclosure or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA CORPORATION is strictly prohibited.
+
+#include "common.h"
+#include "texture.h"
+
+//------------------------------------------------------------------------
+// Memory access and math helpers.
+
+// Strided accumulation helpers: accum_from_mem scatters a vector value into a
+// float array with element stride s; accum_to_mem gathers from such an array
+// into a vector. Overloaded for float/float2/float4.
+static __device__ __forceinline__ void accum_from_mem(float* a, int s, float b, float c) { a[0] += b * c; }
+static __device__ __forceinline__ void accum_from_mem(float* a, int s, float2 b, float c) { a[0] += b.x * c; a[s] += b.y * c; }
+static __device__ __forceinline__ void accum_from_mem(float* a, int s, float4 b, float c) { a[0] += b.x * c; a[s] += b.y * c; a[2*s] += b.z * c; a[3*s] += b.w * c; }
+static __device__ __forceinline__ void accum_to_mem(float& a, float* b, int s) { a += b[0]; }
+static __device__ __forceinline__ void accum_to_mem(float2& a, float* b, int s) { float2 v = a; v.x += b[0]; v.y += b[s]; a = v; }
+static __device__ __forceinline__ void accum_to_mem(float4& a, float* b, int s) { float4 v = a; v.x += b[0]; v.y += b[s]; v.z += b[2*s]; v.w += b[3*s]; a = v; }
+// Component-wise finiteness checks for vector types.
+static __device__ __forceinline__ bool isfinite_vec3(const float3& a) { return isfinite(a.x) && isfinite(a.y) && isfinite(a.z); }
+static __device__ __forceinline__ bool isfinite_vec4(const float4& a) { return isfinite(a.x) && isfinite(a.y) && isfinite(a.z) && isfinite(a.w); }
+// Fix vs. original: the '<class T>' template parameter lists below were
+// missing (stripped angle brackets), leaving ill-formed 'template' keywords.
+template <class T> static __device__ __forceinline__ T lerp (const T& a, const T& b, float c) { return a + c * (b - a); }
+template <class T> static __device__ __forceinline__ T bilerp(const T& a, const T& b, const T& c, const T& d, const float2& e) { return lerp(lerp(a, b, e.x), lerp(c, d, e.x), e.y); }
+
+//------------------------------------------------------------------------
+// Cube map wrapping for smooth filtering across edges and corners. At corners,
+// one of the texture coordinates will be negative. For correct interpolation,
+// the missing texel must take the average color of the other three.
+
+// Packed lookup tables for wrapCubeMap(), indexed by (face << 3) + case
+// number (48 = 6 faces x 8 edge/corner cases). Each 32-bit entry of mask1
+// encodes, in bit fields, the source coordinate selectors and face indices
+// for the four bilinear taps; mask2 holds per-tap coordinate-flip flags.
+// Stored in __constant__ memory: all lanes of a warp read the same entry for
+// a given case, which is the broadcast-friendly access pattern.
+static __constant__ uint32_t c_cubeWrapMask1[48] =
+{
+ 0x1530a440, 0x1133a550, 0x6103a110, 0x1515aa44, 0x6161aa11, 0x40154a04, 0x44115a05, 0x04611a01,
+ 0x2630a440, 0x2233a550, 0x5203a110, 0x2626aa44, 0x5252aa11, 0x40264a04, 0x44225a05, 0x04521a01,
+ 0x32608064, 0x3366a055, 0x13062091, 0x32328866, 0x13132299, 0x50320846, 0x55330a55, 0x05130219,
+ 0x42508064, 0x4455a055, 0x14052091, 0x42428866, 0x14142299, 0x60420846, 0x66440a55, 0x06140219,
+ 0x5230a044, 0x5533a055, 0x1503a011, 0x5252aa44, 0x1515aa11, 0x40520a44, 0x44550a55, 0x04150a11,
+ 0x6130a044, 0x6633a055, 0x2603a011, 0x6161aa44, 0x2626aa11, 0x40610a44, 0x44660a55, 0x04260a11,
+};
+
+// Per-case coordinate flip bits; bit k flips the k-th of the eight x/y
+// coordinates produced by wrapCubeMap().
+static __constant__ uint8_t c_cubeWrapMask2[48] =
+{
+ 0x26, 0x33, 0x11, 0x05, 0x00, 0x09, 0x0c, 0x04, 0x04, 0x00, 0x00, 0x05, 0x00, 0x81, 0xc0, 0x40,
+ 0x02, 0x03, 0x09, 0x00, 0x0a, 0x00, 0x00, 0x02, 0x64, 0x30, 0x90, 0x55, 0xa0, 0x99, 0xcc, 0x64,
+ 0x24, 0x30, 0x10, 0x05, 0x00, 0x01, 0x00, 0x00, 0x06, 0x03, 0x01, 0x05, 0x00, 0x89, 0xcc, 0x44,
+};
+
+// Resolve the four bilinear tap addresses for a cube-map fetch whose 2x2
+// footprint (ix0..ix1, iy0..iy1 on face 'face', face size w) may cross face
+// edges. Returns linearized texel addresses (x + (y + face*w) * w) for the
+// four taps, with coordinates and faces remapped via the constant tables
+// above. At a cube corner one tap has face index -1 in the table, producing a
+// negative address that downstream code (fetchQuad/accumQuad) treats as the
+// missing corner texel.
+static __device__ __forceinline__ int4 wrapCubeMap(int face, int ix0, int ix1, int iy0, int iy1, int w)
+{
+ // Calculate case number.
+ int cx = (ix0 < 0) ? 0 : (ix1 >= w) ? 2 : 1;
+ int cy = (iy0 < 0) ? 0 : (iy1 >= w) ? 6 : 3;
+ int c = cx + cy;
+ if (c >= 5)
+ c--;
+ c = (face << 3) + c;
+
+ // Compute coordinates and faces.
+ unsigned int m = c_cubeWrapMask1[c];
+ // Each 2-bit selector chooses: 0 = constant 0, 1 = the x input, 2 = the y input.
+ int x0 = (m >> 0) & 3; x0 = (x0 == 0) ? 0 : (x0 == 1) ? ix0 : iy0;
+ int x1 = (m >> 2) & 3; x1 = (x1 == 0) ? 0 : (x1 == 1) ? ix1 : iy0;
+ int x2 = (m >> 4) & 3; x2 = (x2 == 0) ? 0 : (x2 == 1) ? ix0 : iy1;
+ int x3 = (m >> 6) & 3; x3 = (x3 == 0) ? 0 : (x3 == 1) ? ix1 : iy1;
+ int y0 = (m >> 8) & 3; y0 = (y0 == 0) ? 0 : (y0 == 1) ? ix0 : iy0;
+ int y1 = (m >> 10) & 3; y1 = (y1 == 0) ? 0 : (y1 == 1) ? ix1 : iy0;
+ int y2 = (m >> 12) & 3; y2 = (y2 == 0) ? 0 : (y2 == 1) ? ix0 : iy1;
+ int y3 = (m >> 14) & 3; y3 = (y3 == 0) ? 0 : (y3 == 1) ? ix1 : iy1;
+ // Face indices are stored biased by +1 so 0 can mean "no face" (corner tap).
+ int f0 = ((m >> 16) & 15) - 1;
+ int f1 = ((m >> 20) & 15) - 1;
+ int f2 = ((m >> 24) & 15) - 1;
+ int f3 = ((m >> 28) ) - 1;
+
+ // Flips.
+ unsigned int f = c_cubeWrapMask2[c];
+ int w1 = w - 1;
+ if (f & 0x01) x0 = w1 - x0;
+ if (f & 0x02) x1 = w1 - x1;
+ if (f & 0x04) x2 = w1 - x2;
+ if (f & 0x08) x3 = w1 - x3;
+ if (f & 0x10) y0 = w1 - y0;
+ if (f & 0x20) y1 = w1 - y1;
+ if (f & 0x40) y2 = w1 - y2;
+ if (f & 0x80) y3 = w1 - y3;
+
+ // Done.
+ int4 tcOut;
+ tcOut.x = x0 + (y0 + f0 * w) * w;
+ tcOut.y = x1 + (y1 + f1 * w) * w;
+ tcOut.z = x2 + (y2 + f2 * w) * w;
+ tcOut.w = x3 + (y3 + f3 * w) * w;
+ return tcOut;
+}
+
+//------------------------------------------------------------------------
+// Cube map indexing and gradient functions.
+
+// Map a 3D lookup vector into (s,t) face coordinates (returned in the first
+// two parameters) and a face index.
+// Select the cube face for lookup vector (x, y, uv.z), rewrite x and y to
+// [0,1] face-local (s,t) coordinates in place, and return the face index
+// (0..5, ordered +x,-x,+y,-y,+z,-z). Returns -1 when the projected
+// coordinates are not finite (degenerate lookup vector).
+static __device__ __forceinline__ int indexCubeMap(float& x, float& y, float z)
+{
+ float ax = fabsf(x);
+ float ay = fabsf(y);
+ float az = fabsf(z);
+ int idx;
+ float c;
+ if (az > fmaxf(ax, ay)) { idx = 4; c = z; }
+ else if (ay > ax) { idx = 2; c = y; y = z; }
+ else { idx = 0; c = x; x = z; }
+ if (c < 0.f) idx += 1;
+ // Round-toward-zero reciprocal; the .5 literals are double and force a
+ // double round trip here — NOTE(review): matches upstream, presumably
+ // intentional for precision; confirm before "optimizing" to .5f.
+ float m = __frcp_rz(fabsf(c)) * .5;
+ // Flip sign of s for faces where the mirrored axis requires it (bit table 0x21).
+ float m0 = __uint_as_float(__float_as_uint(m) ^ ((0x21u >> idx) << 31));
+ float m1 = (idx != 2) ? -m : m;
+ x = x * m0 + .5;
+ y = y * m1 + .5;
+ if (!isfinite(x) || !isfinite(y))
+ return -1; // Invalid uv.
+ x = fminf(fmaxf(x, 0.f), 1.f);
+ y = fminf(fmaxf(y, 0.f), 1.f);
+ return idx;
+}
+
+// Based on dA/d{s,t}, compute dA/d{x,y,z} at a given 3D lookup vector.
+// Backward counterpart of indexCubeMap: given gradients (gu, gv) w.r.t. the
+// face-local (s,t) coordinates at lookup vector uv, return the gradient
+// w.r.t. the 3D vector (x, y, z). Returns zero vector if any component is
+// non-finite (degenerate uv). The idx bit masks (0x01/0x04/0x10 for the
+// major axis, doubled for the negative face) select per-face sign flips.
+static __device__ __forceinline__ float3 indexCubeMapGrad(float3 uv, float gu, float gv)
+{
+ float ax = fabsf(uv.x);
+ float ay = fabsf(uv.y);
+ float az = fabsf(uv.z);
+ int idx;
+ float c;
+ float c0 = gu;
+ float c1 = gv;
+ if (az > fmaxf(ax, ay)) { idx = 0x10; c = uv.z; c0 *= uv.x; c1 *= uv.y; }
+ else if (ay > ax) { idx = 0x04; c = uv.y; c0 *= uv.x; c1 *= uv.z; }
+ else { idx = 0x01; c = uv.x; c0 *= uv.z; c1 *= uv.y; }
+ if (c < 0.f) idx += idx; // Negative face: shift the bit left by one.
+ float m = __frcp_rz(fabsf(c));
+ c0 = (idx & 0x34) ? -c0 : c0;
+ c1 = (idx & 0x2e) ? -c1 : c1;
+ float gl = (c0 + c1) * m;
+ float gx = (idx & 0x03) ? gl : (idx & 0x20) ? -gu : gu;
+ float gy = (idx & 0x0c) ? gl : -gv;
+ float gz = (idx & 0x30) ? gl : (idx & 0x03) ? gu : gv;
+ gz = (idx & 0x09) ? -gz : gz;
+ float3 res = make_float3(gx, gy, gz) * (m * .5f);
+ if (!isfinite_vec3(res))
+ return make_float3(0.f, 0.f, 0.f); // Invalid uv.
+ return res;
+}
+
+// Based on dL/d(d{s,t}/s{X,Y}), compute dL/d(d{x,y,z}/d{X,Y}). This is just two
+// indexCubeMapGrad() functions rolled together.
+// Based on dL/d(d{s,t}/s{X,Y}), compute dL/d(d{x,y,z}/d{X,Y}). This is just two
+// indexCubeMapGrad() functions rolled together: dw packs the four incoming
+// gradients (order: ds/dX, ds/dY, dt/dX, dt/dY per the component usage below)
+// and the two output vectors g0/g1 are the gradients for the X and Y
+// derivative directions respectively. Both outputs are zeroed if either is
+// non-finite.
+static __device__ __forceinline__ void indexCubeMapGrad4(float3 uv, float4 dw, float3& g0, float3& g1)
+{
+ float ax = fabsf(uv.x);
+ float ay = fabsf(uv.y);
+ float az = fabsf(uv.z);
+ int idx;
+ float c, c0, c1;
+ if (az > fmaxf(ax, ay)) { idx = 0x10; c = uv.z; c0 = uv.x; c1 = uv.y; }
+ else if (ay > ax) { idx = 0x04; c = uv.y; c0 = uv.x; c1 = uv.z; }
+ else { idx = 0x01; c = uv.x; c0 = uv.z; c1 = uv.y; }
+ if (c < 0.f) idx += idx; // Negative face: shift the face bit left by one.
+ float m = __frcp_rz(fabsf(c));
+ c0 = (idx & 0x34) ? -c0 : c0;
+ c1 = (idx & 0x2e) ? -c1 : c1;
+ float gl0 = (dw.x * c0 + dw.z * c1) * m;
+ float gl1 = (dw.y * c0 + dw.w * c1) * m;
+ float gx0 = (idx & 0x03) ? gl0 : (idx & 0x20) ? -dw.x : dw.x;
+ float gx1 = (idx & 0x03) ? gl1 : (idx & 0x20) ? -dw.y : dw.y;
+ float gy0 = (idx & 0x0c) ? gl0 : -dw.z;
+ float gy1 = (idx & 0x0c) ? gl1 : -dw.w;
+ float gz0 = (idx & 0x30) ? gl0 : (idx & 0x03) ? dw.x : dw.z;
+ float gz1 = (idx & 0x30) ? gl1 : (idx & 0x03) ? dw.y : dw.w;
+ if (idx & 0x09)
+ {
+ gz0 = -gz0;
+ gz1 = -gz1;
+ }
+ g0 = make_float3(gx0, gy0, gz0) * (m * .5f);
+ g1 = make_float3(gx1, gy1, gz1) * (m * .5f);
+ if (!isfinite_vec3(g0) || !isfinite_vec3(g1))
+ {
+ g0 = make_float3(0.f, 0.f, 0.f); // Invalid uv.
+ g1 = make_float3(0.f, 0.f, 0.f);
+ }
+}
+
+// Compute d{s,t}/d{X,Y} based on d{x,y,z}/d{X,Y} at a given 3D lookup vector.
+// Result is (ds/dX, ds/dY, dt/dX, dt/dY).
+// Compute d{s,t}/d{X,Y} based on d{x,y,z}/d{X,Y} at a given 3D lookup vector.
+// Result is (ds/dX, ds/dY, dt/dX, dt/dY); returns all zeros if any component
+// comes out non-finite (degenerate uv). Used by calculateMipLevel to turn 3D
+// pixel derivatives into 2D face-space derivatives for footprint estimation.
+static __device__ __forceinline__ float4 indexCubeMapGradST(float3 uv, float3 dvdX, float3 dvdY)
+{
+ float ax = fabsf(uv.x);
+ float ay = fabsf(uv.y);
+ float az = fabsf(uv.z);
+ int idx;
+ float c, gu, gv;
+ if (az > fmaxf(ax, ay)) { idx = 0x10; c = uv.z; gu = uv.x; gv = uv.y; }
+ else if (ay > ax) { idx = 0x04; c = uv.y; gu = uv.x; gv = uv.z; }
+ else { idx = 0x01; c = uv.x; gu = uv.z; gv = uv.y; }
+ if (c < 0.f) idx += idx; // Negative face: shift the face bit left by one.
+ if (idx & 0x09)
+ {
+ dvdX.z = -dvdX.z;
+ dvdY.z = -dvdY.z;
+ }
+ float m = __frcp_rz(fabsf(c));
+ float dm = m * .5f;
+ float mm = m * dm;
+ gu *= (idx & 0x34) ? -mm : mm;
+ gv *= (idx & 0x2e) ? -mm : mm;
+
+ float4 res;
+ if (idx & 0x03) // +x or -x face.
+ {
+ res = make_float4(gu * dvdX.x + dm * dvdX.z,
+ gu * dvdY.x + dm * dvdY.z,
+ gv * dvdX.x - dm * dvdX.y,
+ gv * dvdY.x - dm * dvdY.y);
+ }
+ else if (idx & 0x0c) // +y or -y face.
+ {
+ res = make_float4(gu * dvdX.y + dm * dvdX.x,
+ gu * dvdY.y + dm * dvdY.x,
+ gv * dvdX.y + dm * dvdX.z,
+ gv * dvdY.y + dm * dvdY.z);
+ }
+ else // (idx & 0x30)
+ {
+ res = make_float4(gu * dvdX.z + copysignf(dm, c) * dvdX.x,
+ gu * dvdY.z + copysignf(dm, c) * dvdY.x,
+ gv * dvdX.z - dm * dvdX.y,
+ gv * dvdY.z - dm * dvdY.y);
+ }
+
+ if (!isfinite_vec4(res))
+ return make_float4(0.f, 0.f, 0.f, 0.f);
+
+ return res;
+}
+
+// Compute d(d{s,t}/d{X,Y})/d{x,y,z}, i.e., how the pixel derivatives of 2D face
+// coordinates change w.r.t. 3D texture coordinate vector, returned as follows:
+// | d(ds/dX)/dx d(ds/dY)/dx d(dt/dX)/dx d(dt/dY)/dx |
+// | d(ds/dX)/dy d(ds/dY)/dy d(dt/dX)/dy d(dt/dY)/dy |
+// | d(ds/dX)/dz d(ds/dY)/dz d(dt/dX)/dz d(dt/dY)/dz |
+// Compute d(d{s,t}/d{X,Y})/d{x,y,z}, i.e., how the pixel derivatives of 2D face
+// coordinates change w.r.t. 3D texture coordinate vector, returned as follows:
+// | d(ds/dX)/dx d(ds/dY)/dx d(dt/dX)/dx d(dt/dY)/dx |
+// | d(ds/dX)/dy d(ds/dY)/dy d(dt/dX)/dy d(dt/dY)/dy |
+// | d(ds/dX)/dz d(ds/dY)/dz d(dt/dX)/dz d(dt/dY)/dz |
+// Rows are returned in dx, dy, dz. Face selection and sign handling mirror
+// indexCubeMapGradST above; no finiteness filtering is done here (the caller
+// checks the combined result).
+static __device__ __forceinline__ void indexCubeMapGrad2(float3 uv, float3 dvdX, float3 dvdY, float4& dx, float4& dy, float4& dz)
+{
+ float ax = fabsf(uv.x);
+ float ay = fabsf(uv.y);
+ float az = fabsf(uv.z);
+ int idx;
+ float c, gu, gv;
+ if (az > fmaxf(ax, ay)) { idx = 0x10; c = uv.z; gu = uv.x; gv = uv.y; }
+ else if (ay > ax) { idx = 0x04; c = uv.y; gu = uv.x; gv = uv.z; }
+ else { idx = 0x01; c = uv.x; gu = uv.z; gv = uv.y; }
+ if (c < 0.f) idx += idx; // Negative face: shift the face bit left by one.
+
+ if (idx & 0x09)
+ {
+ dvdX.z = -dvdX.z;
+ dvdY.z = -dvdY.z;
+ }
+
+ float m = __frcp_rz(c);
+ float dm = -m * fabsf(m) * .5;
+ float mm = m * m * .5;
+ float mu = (idx & 0x34) ? -mm : mm;
+ float mv = (idx & 0x2e) ? -mm : mm;
+ gu *= -2.0 * m * mu;
+ gv *= -2.0 * m * mv;
+
+ if (idx & 0x03) // +x or -x face.
+ {
+ dx.x = gu * dvdX.x + dm * dvdX.z;
+ dx.y = gu * dvdY.x + dm * dvdY.z;
+ dx.z = gv * dvdX.x - dm * dvdX.y;
+ dx.w = gv * dvdY.x - dm * dvdY.y;
+ dy.x = 0.f;
+ dy.y = 0.f;
+ dy.z = mv * dvdX.x;
+ dy.w = mv * dvdY.x;
+ dz.x = mu * dvdX.x;
+ dz.y = mu * dvdY.x;
+ dz.z = 0.f;
+ dz.w = 0.f;
+ }
+ else if (idx & 0x0c) // +y or -y face.
+ {
+ dx.x = mu * dvdX.y;
+ dx.y = mu * dvdY.y;
+ dx.z = 0.f;
+ dx.w = 0.f;
+ dy.x = gu * dvdX.y + dm * dvdX.x;
+ dy.y = gu * dvdY.y + dm * dvdY.x;
+ dy.z = gv * dvdX.y + dm * dvdX.z;
+ dy.w = gv * dvdY.y + dm * dvdY.z;
+ dz.x = 0.f;
+ dz.y = 0.f;
+ dz.z = mv * dvdX.y;
+ dz.w = mv * dvdY.y;
+ }
+ else // (idx & 0x30)
+ {
+ dx.x = mu * dvdX.z;
+ dx.y = mu * dvdY.z;
+ dx.z = 0.f;
+ dx.w = 0.f;
+ dy.x = 0.f;
+ dy.y = 0.f;
+ dy.z = mv * dvdX.z;
+ dy.w = mv * dvdY.z;
+ dz.x = gu * dvdX.z - fabsf(dm) * dvdX.x;
+ dz.y = gu * dvdY.z - fabsf(dm) * dvdY.x;
+ dz.z = gv * dvdX.z - dm * dvdX.y;
+ dz.w = gv * dvdY.z - dm * dvdY.y;
+ }
+}
+
+//------------------------------------------------------------------------
+// General texture indexing.
+
+// Compute the flat texel address for a nearest-neighbor fetch at uv in texture
+// slice tz (base mip level). Returns -1 for out-of-bounds fetches in zero
+// boundary mode and for invalid cube-map lookup vectors.
+// Fix vs. original: the template parameter list '<bool CUBE_MODE>' was
+// missing (stripped angle brackets) although the body branches on CUBE_MODE.
+template <bool CUBE_MODE>
+static __device__ __forceinline__ int indexTextureNearest(const TextureKernelParams& p, float3 uv, int tz)
+{
+ int w = p.texWidth;
+ int h = p.texHeight;
+ float u = uv.x;
+ float v = uv.y;
+
+ // Cube map indexing.
+ if (CUBE_MODE)
+ {
+ // No wrap. Fold face index into tz right away.
+ int idx = indexCubeMap(u, v, uv.z); // Rewrites u, v.
+ if (idx < 0)
+ return -1; // Invalid uv.
+ tz = 6 * tz + idx;
+ }
+ else
+ {
+ // Handle boundary.
+ if (p.boundaryMode == TEX_BOUNDARY_MODE_WRAP)
+ {
+ // Wrap to [0,1) by subtracting floor(u), floor(v).
+ u = u - (float)__float2int_rd(u);
+ v = v - (float)__float2int_rd(v);
+ }
+ }
+
+ u = u * (float)w;
+ v = v * (float)h;
+
+ int iu = __float2int_rd(u);
+ int iv = __float2int_rd(v);
+
+ // In zero boundary mode, return texture address -1.
+ if (!CUBE_MODE && p.boundaryMode == TEX_BOUNDARY_MODE_ZERO)
+ {
+ if (iu < 0 || iu >= w || iv < 0 || iv >= h)
+ return -1;
+ }
+
+ // Otherwise clamp and calculate the coordinate properly.
+ iu = min(max(iu, 0), w-1);
+ iv = min(max(iv, 0), h-1);
+ return iu + w * (iv + tz * h);
+}
+
+// Compute the four texel addresses (tcOut) and fractional bilinear weights
+// (returned as (u,v) in [0,1)) for a linear fetch at uv, slice tz, mip level
+// 'level'. Addresses are -1 for taps that must read as zero (zero boundary
+// mode, invalid cube uv, or the missing corner texel of a cube map).
+// Fix vs. original: the template parameter list '<bool CUBE_MODE>' was
+// missing (stripped angle brackets) although the body branches on CUBE_MODE.
+template <bool CUBE_MODE>
+static __device__ __forceinline__ float2 indexTextureLinear(const TextureKernelParams& p, float3 uv, int tz, int4& tcOut, int level)
+{
+ // Mip level size.
+ int2 sz = mipLevelSize(p, level);
+ int w = sz.x;
+ int h = sz.y;
+
+ // Compute texture-space u, v.
+ float u = uv.x;
+ float v = uv.y;
+ bool clampU = false;
+ bool clampV = false;
+
+ // Cube map indexing.
+ int face = 0;
+ if (CUBE_MODE)
+ {
+ // Neither clamp or wrap.
+ face = indexCubeMap(u, v, uv.z); // Rewrites u, v.
+ if (face < 0)
+ {
+ tcOut.x = tcOut.y = tcOut.z = tcOut.w = -1; // Invalid uv.
+ return make_float2(0.f, 0.f);
+ }
+ u = u * (float)w - 0.5f;
+ v = v * (float)h - 0.5f;
+ }
+ else
+ {
+ if (p.boundaryMode == TEX_BOUNDARY_MODE_WRAP)
+ {
+ // Wrap.
+ u = u - (float)__float2int_rd(u);
+ v = v - (float)__float2int_rd(v);
+ }
+
+ // Move to texel space.
+ u = u * (float)w - 0.5f;
+ v = v * (float)h - 0.5f;
+
+ if (p.boundaryMode == TEX_BOUNDARY_MODE_CLAMP)
+ {
+ // Clamp to center of edge texels.
+ u = fminf(fmaxf(u, 0.f), w - 1.f);
+ v = fminf(fmaxf(v, 0.f), h - 1.f);
+ clampU = (u == 0.f || u == w - 1.f);
+ clampV = (v == 0.f || v == h - 1.f);
+ }
+ }
+
+ // Compute texel coordinates and weights.
+ int iu0 = __float2int_rd(u);
+ int iv0 = __float2int_rd(v);
+ int iu1 = iu0 + (clampU ? 0 : 1); // Ensure zero u/v gradients with clamped.
+ int iv1 = iv0 + (clampV ? 0 : 1);
+ u -= (float)iu0;
+ v -= (float)iv0;
+
+ // Cube map wrapping.
+ bool cubeWrap = CUBE_MODE && (iu0 < 0 || iv0 < 0 || iu1 >= w || iv1 >= h);
+ if (cubeWrap)
+ {
+ tcOut = wrapCubeMap(face, iu0, iu1, iv0, iv1, w);
+ tcOut += 6 * tz * w * h; // Bring in tz.
+ return make_float2(u, v); // Done.
+ }
+
+ // Fold cube map face into tz.
+ if (CUBE_MODE)
+ tz = 6 * tz + face;
+
+ // Wrap overflowing texel indices.
+ if (!CUBE_MODE && p.boundaryMode == TEX_BOUNDARY_MODE_WRAP)
+ {
+ if (iu0 < 0) iu0 += w;
+ if (iv0 < 0) iv0 += h;
+ if (iu1 >= w) iu1 -= w;
+ if (iv1 >= h) iv1 -= h;
+ }
+
+ // Coordinates with tz folded in.
+ int iu0z = iu0 + tz * w * h;
+ int iu1z = iu1 + tz * w * h;
+ tcOut.x = iu0z + w * iv0;
+ tcOut.y = iu1z + w * iv0;
+ tcOut.z = iu0z + w * iv1;
+ tcOut.w = iu1z + w * iv1;
+
+ // Invalidate texture addresses outside unit square if we are in zero mode.
+ if (!CUBE_MODE && p.boundaryMode == TEX_BOUNDARY_MODE_ZERO)
+ {
+ bool iu0_out = (iu0 < 0 || iu0 >= w);
+ bool iu1_out = (iu1 < 0 || iu1 >= w);
+ bool iv0_out = (iv0 < 0 || iv0 >= h);
+ bool iv1_out = (iv1 < 0 || iv1 >= h);
+ if (iu0_out || iv0_out) tcOut.x = -1;
+ if (iu1_out || iv0_out) tcOut.y = -1;
+ if (iu0_out || iv1_out) tcOut.z = -1;
+ if (iu1_out || iv1_out) tcOut.w = -1;
+ }
+
+ // All done.
+ return make_float2(u, v);
+}
+
+//------------------------------------------------------------------------
+// Mip level calculation.
+
+// Determine the mip levels (level0, level1) and blend fraction flevel for
+// pixel pidx from the UV pixel derivatives in p.uvDA (or mip bias only when
+// BIAS_ONLY). Optionally outputs the mip-level-vs-derivative gradient (*pdw)
+// and, for cube maps, the mip-level-vs-uv gradient (*pdfdv) used by the
+// backward pass. No-op for non-mipmapped filter modes.
+// Fix vs. original: the template parameter list was missing (stripped angle
+// brackets) although the body uses FILTER_MODE, CUBE_MODE and BIAS_ONLY.
+// NOTE(review): parameter ORDER must match the instantiations at the call
+// sites in the texture kernels (not visible in this chunk) — confirm there.
+template <int FILTER_MODE, bool CUBE_MODE, bool BIAS_ONLY>
+static __device__ __forceinline__ void calculateMipLevel(int& level0, int& level1, float& flevel, const TextureKernelParams& p, int pidx, float3 uv, float4* pdw, float3* pdfdv)
+{
+ // Do nothing if mips not in use.
+ if (FILTER_MODE == TEX_MODE_NEAREST || FILTER_MODE == TEX_MODE_LINEAR)
+ return;
+
+ // Determine mip level based on UV pixel derivatives. If no derivatives are given (mip level bias only), leave as zero.
+ if (!BIAS_ONLY)
+ {
+ // Get pixel derivatives of texture coordinates.
+ float4 uvDA;
+ float3 dvdX, dvdY; // Gradients use these later.
+ if (CUBE_MODE)
+ {
+ // Fetch.
+ float2 d0 = ((const float2*)p.uvDA)[3 * pidx + 0];
+ float2 d1 = ((const float2*)p.uvDA)[3 * pidx + 1];
+ float2 d2 = ((const float2*)p.uvDA)[3 * pidx + 2];
+
+ // Map d{x,y,z}/d{X,Y} into d{s,t}/d{X,Y}.
+ dvdX = make_float3(d0.x, d1.x, d2.x); // d{x,y,z}/dX
+ dvdY = make_float3(d0.y, d1.y, d2.y); // d{x,y,z}/dY
+ uvDA = indexCubeMapGradST(uv, dvdX, dvdY); // d{s,t}/d{X,Y}
+ }
+ else
+ {
+ // Fetch.
+ uvDA = ((const float4*)p.uvDA)[pidx];
+ }
+
+ // Scaling factors.
+ float uscl = p.texWidth;
+ float vscl = p.texHeight;
+
+ // d[s,t]/d[X,Y].
+ float dsdx = uvDA.x * uscl;
+ float dsdy = uvDA.y * uscl;
+ float dtdx = uvDA.z * vscl;
+ float dtdy = uvDA.w * vscl;
+
+ // Calculate footprint axis lengths.
+ float A = dsdx*dsdx + dtdx*dtdx;
+ float B = dsdy*dsdy + dtdy*dtdy;
+ float C = dsdx*dsdy + dtdx*dtdy;
+ float l2b = 0.5 * (A + B);
+ float l2n = 0.25 * (A-B)*(A-B) + C*C;
+ float l2a = sqrt(l2n);
+ float lenMinorSqr = fmaxf(0.0, l2b - l2a);
+ float lenMajorSqr = l2b + l2a;
+
+ // Footprint vs. mip level gradient.
+ if (pdw && FILTER_MODE == TEX_MODE_LINEAR_MIPMAP_LINEAR)
+ {
+ float dw = 0.72134752f / (l2n + l2a * l2b); // Constant is 0.5/ln(2).
+ float AB = dw * .5f * (A - B);
+ float Cw = dw * C;
+ float l2aw = dw * l2a;
+ float d_f_ddsdX = uscl * (dsdx * (l2aw + AB) + dsdy * Cw);
+ float d_f_ddsdY = uscl * (dsdy * (l2aw - AB) + dsdx * Cw);
+ float d_f_ddtdX = vscl * (dtdx * (l2aw + AB) + dtdy * Cw);
+ float d_f_ddtdY = vscl * (dtdy * (l2aw - AB) + dtdx * Cw);
+
+ float4 d_f_dw = make_float4(d_f_ddsdX, d_f_ddsdY, d_f_ddtdX, d_f_ddtdY);
+ if (!CUBE_MODE)
+ *pdw = isfinite_vec4(d_f_dw) ? d_f_dw : make_float4(0.f, 0.f, 0.f, 0.f);
+
+ // In cube maps, there is also a texture coordinate vs. mip level gradient.
+ // Only output nonzero vectors if both are free of inf/Nan garbage.
+ if (CUBE_MODE)
+ {
+ float4 dx, dy, dz;
+ indexCubeMapGrad2(uv, dvdX, dvdY, dx, dy, dz);
+ float3 d_dsdX_dv = make_float3(dx.x, dy.x, dz.x);
+ float3 d_dsdY_dv = make_float3(dx.y, dy.y, dz.y);
+ float3 d_dtdX_dv = make_float3(dx.z, dy.z, dz.z);
+ float3 d_dtdY_dv = make_float3(dx.w, dy.w, dz.w);
+
+ float3 d_f_dv = make_float3(0.f, 0.f, 0.f);
+ d_f_dv += d_dsdX_dv * d_f_ddsdX;
+ d_f_dv += d_dsdY_dv * d_f_ddsdY;
+ d_f_dv += d_dtdX_dv * d_f_ddtdX;
+ d_f_dv += d_dtdY_dv * d_f_ddtdY;
+
+ bool finite = isfinite_vec4(d_f_dw) && isfinite_vec3(d_f_dv);
+ *pdw = finite ? d_f_dw : make_float4(0.f, 0.f, 0.f, 0.f);
+ *pdfdv = finite ? d_f_dv : make_float3(0.f, 0.f, 0.f);
+ }
+ }
+
+ // Finally, calculate mip level.
+ flevel = .5f * __log2f(lenMajorSqr); // May be inf/NaN, but clamp fixes it.
+ }
+
+ // Bias the mip level and clamp.
+ if (p.mipLevelBias)
+ flevel += p.mipLevelBias[pidx];
+ flevel = fminf(fmaxf(flevel, 0.f), (float)p.mipLevelMax);
+
+ // Calculate levels depending on filter mode.
+ level0 = __float2int_rd(flevel);
+
+ // Leave everything else at zero if flevel == 0 (magnification) or when in linear-mipmap-nearest mode.
+ if (FILTER_MODE == TEX_MODE_LINEAR_MIPMAP_LINEAR && flevel > 0.f)
+ {
+ level1 = min(level0 + 1, p.mipLevelMax);
+ flevel -= level0; // Fractional part. Zero if clamped on last level.
+ }
+}
+
+//------------------------------------------------------------------------
+// Texel fetch and accumulator helpers that understand cube map corners.
+
+// Fetch the four bilinear taps addressed by tc from pIn into a00..a11.
+// Negative addresses read as zero; when 'corner' is set (cube-map corner),
+// the single missing tap takes the average of the other three so bilinear
+// interpolation stays smooth across the corner.
+// Fix vs. original: the template parameter list '<class T>' and the explicit
+// '<T>' arguments on zero_value() were missing (stripped angle brackets);
+// zero_value has no deducible argument, so the explicit <T> is required.
+template <class T>
+static __device__ __forceinline__ void fetchQuad(T& a00, T& a10, T& a01, T& a11, const float* pIn, int4 tc, bool corner)
+{
+ // For invalid cube map uv, tc will be all negative, and all texel values will be zero.
+ if (corner)
+ {
+ T avg = zero_value<T>();
+ if (tc.x >= 0) avg += (a00 = *((const T*)&pIn[tc.x]));
+ if (tc.y >= 0) avg += (a10 = *((const T*)&pIn[tc.y]));
+ if (tc.z >= 0) avg += (a01 = *((const T*)&pIn[tc.z]));
+ if (tc.w >= 0) avg += (a11 = *((const T*)&pIn[tc.w]));
+ avg *= 0.33333333f; // Average of the three valid taps.
+ if (tc.x < 0) a00 = avg;
+ if (tc.y < 0) a10 = avg;
+ if (tc.z < 0) a01 = avg;
+ if (tc.w < 0) a11 = avg;
+ }
+ else
+ {
+ a00 = (tc.x >= 0) ? *((const T*)&pIn[tc.x]) : zero_value<T>();
+ a10 = (tc.y >= 0) ? *((const T*)&pIn[tc.y]) : zero_value<T>();
+ a01 = (tc.z >= 0) ? *((const T*)&pIn[tc.z]) : zero_value<T>();
+ a11 = (tc.w >= 0) ? *((const T*)&pIn[tc.w]) : zero_value<T>();
+ }
+}
+
+static __device__ __forceinline__ void accumQuad(float4 c, float* pOut, int level, int4 tc, bool corner, CA_TEMP_PARAM)
+{
+    // Scatter-accumulate the four bilinear gradient contributions c.{x,y,z,w}
+    // into the texels addressed by tc.{x,y,z,w} of mip level 'level'.
+    // For invalid cube map uv, tc will be all negative, and no accumulation
+    // will take place.
+    if (corner)
+    {
+        // Cube map corner: the texel whose tc component is negative does not
+        // exist. Its gradient share is split evenly (1/3 each) among the three
+        // valid texels, mirroring the averaging performed in fetchQuad.
+        // Note: 'corner' implies at least one tc component is negative, so cb
+        // is always assigned before use.
+        float cb;
+        if (tc.x < 0) cb = c.x;
+        if (tc.y < 0) cb = c.y;
+        if (tc.z < 0) cb = c.z;
+        if (tc.w < 0) cb = c.w;
+        cb *= 0.33333333f;
+        if (tc.x >= 0) caAtomicAddTexture(pOut, level, tc.x, c.x + cb);
+        if (tc.y >= 0) caAtomicAddTexture(pOut, level, tc.y, c.y + cb);
+        if (tc.z >= 0) caAtomicAddTexture(pOut, level, tc.z, c.z + cb);
+        if (tc.w >= 0) caAtomicAddTexture(pOut, level, tc.w, c.w + cb);
+    }
+    else
+    {
+        // Regular case: accumulate each valid texel independently.
+        if (tc.x >= 0) caAtomicAddTexture(pOut, level, tc.x, c.x);
+        if (tc.y >= 0) caAtomicAddTexture(pOut, level, tc.y, c.y);
+        if (tc.z >= 0) caAtomicAddTexture(pOut, level, tc.z, c.z);
+        if (tc.w >= 0) caAtomicAddTexture(pOut, level, tc.w, c.w);
+    }
+}
+
+//------------------------------------------------------------------------
+// Mip builder kernel.
+
+template <class T, int C>
+static __forceinline__ __device__ void MipBuildKernelTemplate(const TextureKernelParams p)
+{
+    // Builds mip level p.mipLevelOut by box-filtering level p.mipLevelOut - 1.
+    // One thread per output texel. T is the vector type (float/float2/float4)
+    // and C its channel count; p.channels must be a multiple of C.
+
+    // Sizes.
+    int2 sz_in = mipLevelSize(p, p.mipLevelOut - 1);
+    int2 sz_out = mipLevelSize(p, p.mipLevelOut);
+
+    // Calculate pixel position.
+    int px = blockIdx.x * blockDim.x + threadIdx.x;
+    int py = blockIdx.y * blockDim.y + threadIdx.y;
+    int pz = blockIdx.z;
+    if (px >= sz_out.x || py >= sz_out.y)
+        return;
+
+    // Pixel indices. Each output texel reads a 2x2 block of input texels.
+    int pidx_in0 = p.channels * (((px + sz_in.x * py) << 1) + (pz * sz_in.x * sz_in.y));
+    int pidx_in1 = pidx_in0 + p.channels * sz_in.x; // Next pixel down.
+    int pidx_out = p.channels * (px + sz_out.x * (py + sz_out.y * pz));
+
+    // Input and output pointers.
+    const float* pin = p.tex[p.mipLevelOut - 1];
+    float* pout = (float*)p.tex[p.mipLevelOut];
+
+    // Special case: Input texture height or width is 1. Average only two
+    // texels along the remaining non-degenerate axis.
+    if (sz_in.x == 1 || sz_in.y == 1)
+    {
+        if (sz_in.y == 1)
+            pidx_in1 = pidx_in0 + p.channels; // Next pixel on the right.
+
+        for (int i=0; i < p.channels; i += C)
+        {
+            T v0 = *((const T*)&pin[pidx_in0 + i]);
+            T v1 = *((const T*)&pin[pidx_in1 + i]);
+            T avg = .5f * (v0 + v1);
+#if TEX_DEBUG_MIP_RETAIN_VARIANCE
+            avg = (avg - .5f) * 1.41421356f + .5f;
+#endif
+            *((T*)&pout[pidx_out + i]) = avg;
+        }
+
+        return;
+    }
+
+    // General case: 2x2 box filter.
+    for (int i=0; i < p.channels; i += C)
+    {
+        T v0 = *((const T*)&pin[pidx_in0 + i]);
+        T v1 = *((const T*)&pin[pidx_in0 + i + p.channels]);
+        T v2 = *((const T*)&pin[pidx_in1 + i]);
+        T v3 = *((const T*)&pin[pidx_in1 + i + p.channels]);
+        T avg = .25f * (v0 + v1 + v2 + v3);
+#if TEX_DEBUG_MIP_RETAIN_VARIANCE
+        avg = (avg - .5f) * 2.f + .5f;
+#endif
+        *((T*)&pout[pidx_out + i]) = avg;
+    }
+}
+
+// Template specializations.
+__global__ void MipBuildKernel1(const TextureKernelParams p) { MipBuildKernelTemplate<float,  1>(p); }
+__global__ void MipBuildKernel2(const TextureKernelParams p) { MipBuildKernelTemplate<float2, 2>(p); }
+__global__ void MipBuildKernel4(const TextureKernelParams p) { MipBuildKernelTemplate<float4, 4>(p); }
+
+//------------------------------------------------------------------------
+// Forward kernel.
+
+template <class T, int C, bool CUBE_MODE, bool BIAS_ONLY, int FILTER_MODE>
+static __forceinline__ __device__ void TextureFwdKernelTemplate(const TextureKernelParams p)
+{
+    // Forward texture sampling: one thread per output pixel. T is the vector
+    // type (float/float2/float4) and C its channel count. CUBE_MODE selects
+    // cube map addressing (3D uv), BIAS_ONLY controls mip level calculation,
+    // and FILTER_MODE is one of the TEX_MODE_ constants.
+
+    // Calculate pixel position.
+    int px = blockIdx.x * blockDim.x + threadIdx.x;
+    int py = blockIdx.y * blockDim.y + threadIdx.y;
+    int pz = blockIdx.z;
+    int tz = (p.texDepth == 1) ? 0 : pz; // Broadcast a single texture over the minibatch.
+    if (px >= p.imgWidth || py >= p.imgHeight || pz >= p.n)
+        return;
+
+    // Pixel index.
+    int pidx = px + p.imgWidth * (py + p.imgHeight * pz);
+
+    // Output ptr.
+    float* pOut = p.out + pidx * p.channels;
+
+    // Get UV: 3D direction for cube maps, 2D texcoord otherwise.
+    float3 uv;
+    if (CUBE_MODE)
+        uv = ((const float3*)p.uv)[pidx];
+    else
+        uv = make_float3(((const float2*)p.uv)[pidx], 0.f);
+
+    // Nearest mode.
+    if (FILTER_MODE == TEX_MODE_NEAREST)
+    {
+        int tc = indexTextureNearest<CUBE_MODE>(p, uv, tz);
+        tc *= p.channels;
+        const float* pIn = p.tex[0];
+
+        // Copy if valid tc, otherwise output zero.
+        for (int i=0; i < p.channels; i += C)
+            *((T*)&pOut[i]) = (tc >= 0) ? *((const T*)&pIn[tc + i]) : zero_value<T>();
+
+        return; // Exit.
+    }
+
+    // Calculate mip level. In 'linear' mode these will all stay zero.
+    float flevel = 0.f; // Fractional level.
+    int level0 = 0; // Discrete level 0.
+    int level1 = 0; // Discrete level 1.
+    calculateMipLevel<CUBE_MODE, BIAS_ONLY, FILTER_MODE>(level0, level1, flevel, p, pidx, uv, 0, 0);
+
+    // Get texel indices and pointer for level 0.
+    int4 tc0 = make_int4(0, 0, 0, 0);
+    float2 uv0 = indexTextureLinear<CUBE_MODE>(p, uv, tz, tc0, level0);
+    const float* pIn0 = p.tex[level0];
+    bool corner0 = CUBE_MODE && ((tc0.x | tc0.y | tc0.z | tc0.w) < 0);
+    tc0 *= p.channels;
+
+    // Bilinear fetch.
+    if (FILTER_MODE == TEX_MODE_LINEAR || FILTER_MODE == TEX_MODE_LINEAR_MIPMAP_NEAREST)
+    {
+        // Interpolate.
+        for (int i=0; i < p.channels; i += C, tc0 += C)
+        {
+            T a00, a10, a01, a11;
+            fetchQuad(a00, a10, a01, a11, pIn0, tc0, corner0);
+            *((T*)&pOut[i]) = bilerp(a00, a10, a01, a11, uv0);
+        }
+        return; // Exit.
+    }
+
+    // Get texel indices and pointer for level 1.
+    int4 tc1 = make_int4(0, 0, 0, 0);
+    float2 uv1 = indexTextureLinear<CUBE_MODE>(p, uv, tz, tc1, level1);
+    const float* pIn1 = p.tex[level1];
+    bool corner1 = CUBE_MODE && ((tc1.x | tc1.y | tc1.z | tc1.w) < 0);
+    tc1 *= p.channels;
+
+    // Trilinear fetch.
+    for (int i=0; i < p.channels; i += C, tc0 += C, tc1 += C)
+    {
+        // First level.
+        T a00, a10, a01, a11;
+        fetchQuad(a00, a10, a01, a11, pIn0, tc0, corner0);
+        T a = bilerp(a00, a10, a01, a11, uv0);
+
+        // Second level unless in magnification mode.
+        if (flevel > 0.f)
+        {
+            T b00, b10, b01, b11;
+            fetchQuad(b00, b10, b01, b11, pIn1, tc1, corner1);
+            T b = bilerp(b00, b10, b01, b11, uv1);
+            a = lerp(a, b, flevel); // Interpolate between levels.
+        }
+
+        // Write.
+        *((T*)&pOut[i]) = a;
+    }
+}
+
+// Template specializations.
+__global__ void TextureFwdKernelNearest1                    (const TextureKernelParams p) { TextureFwdKernelTemplate<float,  1, false, false, TEX_MODE_NEAREST>(p); }
+__global__ void TextureFwdKernelNearest2                    (const TextureKernelParams p) { TextureFwdKernelTemplate<float2, 2, false, false, TEX_MODE_NEAREST>(p); }
+__global__ void TextureFwdKernelNearest4                    (const TextureKernelParams p) { TextureFwdKernelTemplate<float4, 4, false, false, TEX_MODE_NEAREST>(p); }
+__global__ void TextureFwdKernelLinear1                     (const TextureKernelParams p) { TextureFwdKernelTemplate<float,  1, false, false, TEX_MODE_LINEAR>(p); }
+__global__ void TextureFwdKernelLinear2                     (const TextureKernelParams p) { TextureFwdKernelTemplate<float2, 2, false, false, TEX_MODE_LINEAR>(p); }
+__global__ void TextureFwdKernelLinear4                     (const TextureKernelParams p) { TextureFwdKernelTemplate<float4, 4, false, false, TEX_MODE_LINEAR>(p); }
+__global__ void TextureFwdKernelLinearMipmapNearest1        (const TextureKernelParams p) { TextureFwdKernelTemplate<float,  1, false, false, TEX_MODE_LINEAR_MIPMAP_NEAREST>(p); }
+__global__ void TextureFwdKernelLinearMipmapNearest2        (const TextureKernelParams p) { TextureFwdKernelTemplate<float2, 2, false, false, TEX_MODE_LINEAR_MIPMAP_NEAREST>(p); }
+__global__ void TextureFwdKernelLinearMipmapNearest4        (const TextureKernelParams p) { TextureFwdKernelTemplate<float4, 4, false, false, TEX_MODE_LINEAR_MIPMAP_NEAREST>(p); }
+__global__ void TextureFwdKernelLinearMipmapLinear1         (const TextureKernelParams p) { TextureFwdKernelTemplate<float,  1, false, false, TEX_MODE_LINEAR_MIPMAP_LINEAR>(p); }
+__global__ void TextureFwdKernelLinearMipmapLinear2         (const TextureKernelParams p) { TextureFwdKernelTemplate<float2, 2, false, false, TEX_MODE_LINEAR_MIPMAP_LINEAR>(p); }
+__global__ void TextureFwdKernelLinearMipmapLinear4         (const TextureKernelParams p) { TextureFwdKernelTemplate<float4, 4, false, false, TEX_MODE_LINEAR_MIPMAP_LINEAR>(p); }
+__global__ void TextureFwdKernelCubeNearest1                (const TextureKernelParams p) { TextureFwdKernelTemplate<float,  1, true,  false, TEX_MODE_NEAREST>(p); }
+__global__ void TextureFwdKernelCubeNearest2                (const TextureKernelParams p) { TextureFwdKernelTemplate<float2, 2, true,  false, TEX_MODE_NEAREST>(p); }
+__global__ void TextureFwdKernelCubeNearest4                (const TextureKernelParams p) { TextureFwdKernelTemplate<float4, 4, true,  false, TEX_MODE_NEAREST>(p); }
+__global__ void TextureFwdKernelCubeLinear1                 (const TextureKernelParams p) { TextureFwdKernelTemplate<float,  1, true,  false, TEX_MODE_LINEAR>(p); }
+__global__ void TextureFwdKernelCubeLinear2                 (const TextureKernelParams p) { TextureFwdKernelTemplate<float2, 2, true,  false, TEX_MODE_LINEAR>(p); }
+__global__ void TextureFwdKernelCubeLinear4                 (const TextureKernelParams p) { TextureFwdKernelTemplate<float4, 4, true,  false, TEX_MODE_LINEAR>(p); }
+__global__ void TextureFwdKernelCubeLinearMipmapNearest1    (const TextureKernelParams p) { TextureFwdKernelTemplate<float,  1, true,  false, TEX_MODE_LINEAR_MIPMAP_NEAREST>(p); }
+__global__ void TextureFwdKernelCubeLinearMipmapNearest2    (const TextureKernelParams p) { TextureFwdKernelTemplate<float2, 2, true,  false, TEX_MODE_LINEAR_MIPMAP_NEAREST>(p); }
+__global__ void TextureFwdKernelCubeLinearMipmapNearest4    (const TextureKernelParams p) { TextureFwdKernelTemplate<float4, 4, true,  false, TEX_MODE_LINEAR_MIPMAP_NEAREST>(p); }
+__global__ void TextureFwdKernelCubeLinearMipmapLinear1     (const TextureKernelParams p) { TextureFwdKernelTemplate<float,  1, true,  false, TEX_MODE_LINEAR_MIPMAP_LINEAR>(p); }
+__global__ void TextureFwdKernelCubeLinearMipmapLinear2     (const TextureKernelParams p) { TextureFwdKernelTemplate<float2, 2, true,  false, TEX_MODE_LINEAR_MIPMAP_LINEAR>(p); }
+__global__ void TextureFwdKernelCubeLinearMipmapLinear4     (const TextureKernelParams p) { TextureFwdKernelTemplate<float4, 4, true,  false, TEX_MODE_LINEAR_MIPMAP_LINEAR>(p); }
+__global__ void TextureFwdKernelLinearMipmapNearestBO1      (const TextureKernelParams p) { TextureFwdKernelTemplate<float,  1, false, true,  TEX_MODE_LINEAR_MIPMAP_NEAREST>(p); }
+__global__ void TextureFwdKernelLinearMipmapNearestBO2      (const TextureKernelParams p) { TextureFwdKernelTemplate<float2, 2, false, true,  TEX_MODE_LINEAR_MIPMAP_NEAREST>(p); }
+__global__ void TextureFwdKernelLinearMipmapNearestBO4      (const TextureKernelParams p) { TextureFwdKernelTemplate<float4, 4, false, true,  TEX_MODE_LINEAR_MIPMAP_NEAREST>(p); }
+__global__ void TextureFwdKernelLinearMipmapLinearBO1       (const TextureKernelParams p) { TextureFwdKernelTemplate<float,  1, false, true,  TEX_MODE_LINEAR_MIPMAP_LINEAR>(p); }
+__global__ void TextureFwdKernelLinearMipmapLinearBO2       (const TextureKernelParams p) { TextureFwdKernelTemplate<float2, 2, false, true,  TEX_MODE_LINEAR_MIPMAP_LINEAR>(p); }
+__global__ void TextureFwdKernelLinearMipmapLinearBO4       (const TextureKernelParams p) { TextureFwdKernelTemplate<float4, 4, false, true,  TEX_MODE_LINEAR_MIPMAP_LINEAR>(p); }
+__global__ void TextureFwdKernelCubeLinearMipmapNearestBO1  (const TextureKernelParams p) { TextureFwdKernelTemplate<float,  1, true,  true,  TEX_MODE_LINEAR_MIPMAP_NEAREST>(p); }
+__global__ void TextureFwdKernelCubeLinearMipmapNearestBO2  (const TextureKernelParams p) { TextureFwdKernelTemplate<float2, 2, true,  true,  TEX_MODE_LINEAR_MIPMAP_NEAREST>(p); }
+__global__ void TextureFwdKernelCubeLinearMipmapNearestBO4  (const TextureKernelParams p) { TextureFwdKernelTemplate<float4, 4, true,  true,  TEX_MODE_LINEAR_MIPMAP_NEAREST>(p); }
+__global__ void TextureFwdKernelCubeLinearMipmapLinearBO1   (const TextureKernelParams p) { TextureFwdKernelTemplate<float,  1, true,  true,  TEX_MODE_LINEAR_MIPMAP_LINEAR>(p); }
+__global__ void TextureFwdKernelCubeLinearMipmapLinearBO2   (const TextureKernelParams p) { TextureFwdKernelTemplate<float2, 2, true,  true,  TEX_MODE_LINEAR_MIPMAP_LINEAR>(p); }
+__global__ void TextureFwdKernelCubeLinearMipmapLinearBO4   (const TextureKernelParams p) { TextureFwdKernelTemplate<float4, 4, true,  true,  TEX_MODE_LINEAR_MIPMAP_LINEAR>(p); }
+
+//------------------------------------------------------------------------
+// Gradient mip puller kernel.
+
+template <class T, int C>
+static __forceinline__ __device__ void MipGradKernelTemplate(const TextureKernelParams p)
+{
+    // Pulls gradients accumulated on all mip levels down into the base-level
+    // gradient texture. One thread per base-level texel. T is the vector type
+    // and C its channel count. Requires dynamic shared memory for one texel
+    // (p.channels floats) per thread in the block.
+
+    // Calculate pixel position.
+    int px = blockIdx.x * blockDim.x + threadIdx.x;
+    int py = blockIdx.y * blockDim.y + threadIdx.y;
+    int pz = blockIdx.z;
+    if (px >= p.texWidth || py >= p.texHeight)
+        return;
+
+    // Number of wide elements.
+    int c = p.channels;
+    if (C == 2) c >>= 1;
+    if (C == 4) c >>= 2;
+
+    // Dynamically allocated shared memory for holding a texel.
+    extern __shared__ float s_texelAccum[];
+    int sharedOfs = threadIdx.x + threadIdx.y * blockDim.x;
+    int sharedStride = blockDim.x * blockDim.y;
+#   define TEXEL_ACCUM(_i) (s_texelAccum + (sharedOfs + (_i) * sharedStride))
+
+    // Clear the texel. Shared memory is uninitialized on entry.
+    for (int i=0; i < p.channels; i++)
+        *TEXEL_ACCUM(i) = 0.f;
+
+    // Track texel position and accumulation weight over the mip stack.
+    int x = px;
+    int y = py;
+    float w = 1.f;
+
+    // Pull gradients from all levels.
+    int2 sz = mipLevelSize(p, 0); // Previous level size.
+    for (int level=1; level <= p.mipLevelMax; level++)
+    {
+        // Weight decay depends on previous level size: each box-filtered axis
+        // halves the contribution of a single base-level texel.
+        if (sz.x > 1) w *= .5f;
+        if (sz.y > 1) w *= .5f;
+
+        // Current level size and coordinates.
+        sz = mipLevelSize(p, level);
+        x >>= 1;
+        y >>= 1;
+
+        T* pIn = (T*)(p.gradTex[level] + (x + sz.x * (y + sz.y * pz)) * p.channels);
+        for (int i=0; i < c; i++)
+            accum_from_mem(TEXEL_ACCUM(i * C), sharedStride, pIn[i], w);
+    }
+
+    // Add to main texture gradients.
+    T* pOut = (T*)(p.gradTex[0] + (px + p.texWidth * (py + p.texHeight * pz)) * p.channels);
+    for (int i=0; i < c; i++)
+        accum_to_mem(pOut[i], TEXEL_ACCUM(i * C), sharedStride);
+}
+
+// Template specializations.
+__global__ void MipGradKernel1(const TextureKernelParams p) { MipGradKernelTemplate<float,  1>(p); }
+__global__ void MipGradKernel2(const TextureKernelParams p) { MipGradKernelTemplate<float2, 2>(p); }
+__global__ void MipGradKernel4(const TextureKernelParams p) { MipGradKernelTemplate<float4, 4>(p); }
+
+//------------------------------------------------------------------------
+// Gradient kernel.
+
+template <bool CUBE_MODE, bool BIAS_ONLY, int FILTER_MODE>
+static __forceinline__ __device__ void TextureGradKernelTemplate(const TextureKernelParams p)
+{
+    // Backward pass: computes gradients w.r.t. texture texels, uv, uv pixel
+    // differentials and mip level bias for one output pixel. CUBE_MODE selects
+    // cube map addressing, BIAS_ONLY skips the uv pixel differential gradient
+    // output, FILTER_MODE is one of the TEX_MODE_ constants.
+
+    // Temporary space for coalesced atomics.
+    CA_DECLARE_TEMP(TEX_GRAD_MAX_KERNEL_BLOCK_WIDTH * TEX_GRAD_MAX_KERNEL_BLOCK_HEIGHT);
+
+    // Calculate pixel position.
+    int px = blockIdx.x * blockDim.x + threadIdx.x;
+    int py = blockIdx.y * blockDim.y + threadIdx.y;
+    int pz = blockIdx.z;
+    int tz = (p.texDepth == 1) ? 0 : pz;
+    if (px >= p.imgWidth || py >= p.imgHeight || pz >= p.n)
+        return;
+
+    // Pixel index.
+    int pidx = px + p.imgWidth * (py + p.imgHeight * pz);
+
+    // Early exit if output gradients are zero. OR the raw bit patterns of all
+    // incoming gradient values; result is zero iff all of them are (+/-)0.f.
+    const float* pDy = p.dy + pidx * p.channels;
+    unsigned int dmax = 0u;
+    if ((p.channels & 3) == 0)
+    {
+        // Channel count divisible by four: scan with wide loads.
+        for (int i=0; i < p.channels; i += 4)
+        {
+            uint4 dy = *((const uint4*)&pDy[i]);
+            dmax |= (dy.x | dy.y | dy.z | dy.w);
+        }
+    }
+    else
+    {
+        for (int i=0; i < p.channels; i++)
+            dmax |= __float_as_uint(pDy[i]);
+    }
+
+    // Store zeros and exit.
+    if (__uint_as_float(dmax) == 0.f)
+    {
+        if (CUBE_MODE)
+        {
+            if (FILTER_MODE != TEX_MODE_NEAREST)
+                ((float3*)p.gradUV)[pidx] = make_float3(0.f, 0.f, 0.f);
+            if (FILTER_MODE == TEX_MODE_LINEAR_MIPMAP_LINEAR)
+            {
+                if (p.gradUVDA)
+                {
+                    ((float2*)p.gradUVDA)[3 * pidx + 0] = make_float2(0.f, 0.f);
+                    ((float2*)p.gradUVDA)[3 * pidx + 1] = make_float2(0.f, 0.f);
+                    ((float2*)p.gradUVDA)[3 * pidx + 2] = make_float2(0.f, 0.f);
+                }
+                if (p.gradMipLevelBias)
+                    p.gradMipLevelBias[pidx] = 0.f;
+            }
+        }
+        else
+        {
+            if (FILTER_MODE != TEX_MODE_NEAREST)
+                ((float2*)p.gradUV)[pidx] = make_float2(0.f, 0.f);
+            if (FILTER_MODE == TEX_MODE_LINEAR_MIPMAP_LINEAR)
+            {
+                if (p.gradUVDA)
+                    ((float4*)p.gradUVDA)[pidx] = make_float4(0.f, 0.f, 0.f, 0.f);
+                if (p.gradMipLevelBias)
+                    p.gradMipLevelBias[pidx] = 0.f;
+            }
+        }
+        return;
+    }
+
+    // Get UV.
+    float3 uv;
+    if (CUBE_MODE)
+        uv = ((const float3*)p.uv)[pidx];
+    else
+        uv = make_float3(((const float2*)p.uv)[pidx], 0.f);
+
+    // Nearest mode - texture gradients only.
+    if (FILTER_MODE == TEX_MODE_NEAREST)
+    {
+        int tc = indexTextureNearest<CUBE_MODE>(p, uv, tz);
+        if (tc < 0)
+            return; // Outside texture.
+
+        tc *= p.channels;
+        float* pOut = p.gradTex[0];
+
+        // Accumulate texture gradients.
+        for (int i=0; i < p.channels; i++)
+            caAtomicAddTexture(pOut, 0, tc + i, pDy[i]);
+
+        return; // Exit.
+    }
+
+    // Calculate mip level. In 'linear' mode these will all stay zero.
+    float4 dw = make_float4(0.f, 0.f, 0.f, 0.f);
+    float3 dfdv = make_float3(0.f, 0.f, 0.f);
+    float flevel = 0.f; // Fractional level.
+    int level0 = 0; // Discrete level 0.
+    int level1 = 0; // Discrete level 1.
+    calculateMipLevel<CUBE_MODE, BIAS_ONLY, FILTER_MODE>(level0, level1, flevel, p, pidx, uv, &dw, &dfdv);
+
+    // UV gradient accumulators.
+    float gu = 0.f;
+    float gv = 0.f;
+
+    // Get texel indices and pointers for level 0.
+    int4 tc0 = make_int4(0, 0, 0, 0);
+    float2 uv0 = indexTextureLinear<CUBE_MODE>(p, uv, tz, tc0, level0);
+    const float* pIn0 = p.tex[level0];
+    float* pOut0 = p.gradTex[level0];
+    bool corner0 = CUBE_MODE && ((tc0.x | tc0.y | tc0.z | tc0.w) < 0);
+    tc0 *= p.channels;
+
+    // Texel weights (bilinear basis evaluated at uv0).
+    float uv011 = uv0.x * uv0.y;
+    float uv010 = uv0.x - uv011;
+    float uv001 = uv0.y - uv011;
+    float uv000 = 1.f - uv0.x - uv001;
+    float4 tw0 = make_float4(uv000, uv010, uv001, uv011);
+
+    // Attribute weights.
+    int2 sz0 = mipLevelSize(p, level0);
+    float sclu0 = (float)sz0.x;
+    float sclv0 = (float)sz0.y;
+
+    // Bilinear mode - texture and uv gradients.
+    if (FILTER_MODE == TEX_MODE_LINEAR || FILTER_MODE == TEX_MODE_LINEAR_MIPMAP_NEAREST)
+    {
+        for (int i=0; i < p.channels; i++, tc0 += 1)
+        {
+            float dy = pDy[i];
+            accumQuad(tw0 * dy, pOut0, level0, tc0, corner0, CA_TEMP);
+
+            float a00, a10, a01, a11;
+            fetchQuad(a00, a10, a01, a11, pIn0, tc0, corner0);
+            float ad = (a11 + a00 - a10 - a01);
+            gu += dy * ((a10 - a00) + uv0.y * ad) * sclu0;
+            gv += dy * ((a01 - a00) + uv0.x * ad) * sclv0;
+        }
+
+        // Store UV gradients and exit.
+        if (CUBE_MODE)
+            ((float3*)p.gradUV)[pidx] = indexCubeMapGrad(uv, gu, gv);
+        else
+            ((float2*)p.gradUV)[pidx] = make_float2(gu, gv);
+
+        return;
+    }
+
+    // Accumulate fractional mip level gradient.
+    float df = 0; // dL/df.
+
+    // Get texel indices and pointers for level 1.
+    int4 tc1 = make_int4(0, 0, 0, 0);
+    float2 uv1 = indexTextureLinear<CUBE_MODE>(p, uv, tz, tc1, level1);
+    const float* pIn1 = p.tex[level1];
+    float* pOut1 = p.gradTex[level1];
+    bool corner1 = CUBE_MODE && ((tc1.x | tc1.y | tc1.z | tc1.w) < 0);
+    tc1 *= p.channels;
+
+    // Texel weights.
+    float uv111 = uv1.x * uv1.y;
+    float uv110 = uv1.x - uv111;
+    float uv101 = uv1.y - uv111;
+    float uv100 = 1.f - uv1.x - uv101;
+    float4 tw1 = make_float4(uv100, uv110, uv101, uv111);
+
+    // Attribute weights.
+    int2 sz1 = mipLevelSize(p, level1);
+    float sclu1 = (float)sz1.x;
+    float sclv1 = (float)sz1.y;
+
+    // Trilinear mode.
+    for (int i=0; i < p.channels; i++, tc0 += 1, tc1 += 1)
+    {
+        float dy = pDy[i];
+        float dy0 = (1.f - flevel) * dy;
+        accumQuad(tw0 * dy0, pOut0, level0, tc0, corner0, CA_TEMP);
+
+        // UV gradients for first level.
+        float a00, a10, a01, a11;
+        fetchQuad(a00, a10, a01, a11, pIn0, tc0, corner0);
+        float ad = (a11 + a00 - a10 - a01);
+        gu += dy0 * ((a10 - a00) + uv0.y * ad) * sclu0;
+        gv += dy0 * ((a01 - a00) + uv0.x * ad) * sclv0;
+
+        // Second level unless in magnification mode.
+        if (flevel > 0.f)
+        {
+            // Texture gradients for second level.
+            float dy1 = flevel * dy;
+            accumQuad(tw1 * dy1, pOut1, level1, tc1, corner1, CA_TEMP);
+
+            // UV gradients for second level.
+            float b00, b10, b01, b11;
+            fetchQuad(b00, b10, b01, b11, pIn1, tc1, corner1);
+            float bd = (b11 + b00 - b10 - b01);
+            gu += dy1 * ((b10 - b00) + uv1.y * bd) * sclu1;
+            gv += dy1 * ((b01 - b00) + uv1.x * bd) * sclv1;
+
+            // Mip level gradient.
+            float a = bilerp(a00, a10, a01, a11, uv0);
+            float b = bilerp(b00, b10, b01, b11, uv1);
+            df += (b-a) * dy;
+        }
+    }
+
+    // Store UV gradients.
+    if (CUBE_MODE)
+        ((float3*)p.gradUV)[pidx] = indexCubeMapGrad(uv, gu, gv) + (dfdv * df);
+    else
+        ((float2*)p.gradUV)[pidx] = make_float2(gu, gv);
+
+    // Store mip level bias gradient.
+    if (p.gradMipLevelBias)
+        p.gradMipLevelBias[pidx] = df;
+
+    // Store UV pixel differential gradients.
+    if (!BIAS_ONLY)
+    {
+        // Final gradients.
+        dw *= df; // dL/(d{s,t}/d{X,Y}) = df/(d{s,t}/d{X,Y}) * dL/df.
+
+        // Store them.
+        if (CUBE_MODE)
+        {
+            // Remap from dL/(d{s,t}/d{X,Y}) to dL/(d{x,y,z}/d{X,Y}).
+            float3 g0, g1;
+            indexCubeMapGrad4(uv, dw, g0, g1);
+            ((float2*)p.gradUVDA)[3 * pidx + 0] = make_float2(g0.x, g1.x);
+            ((float2*)p.gradUVDA)[3 * pidx + 1] = make_float2(g0.y, g1.y);
+            ((float2*)p.gradUVDA)[3 * pidx + 2] = make_float2(g0.z, g1.z);
+        }
+        else
+            ((float4*)p.gradUVDA)[pidx] = dw;
+    }
+}
+
+// Template specializations.
+__global__ void TextureGradKernelNearest                    (const TextureKernelParams p) { TextureGradKernelTemplate<false, false, TEX_MODE_NEAREST>(p); }
+__global__ void TextureGradKernelLinear                     (const TextureKernelParams p) { TextureGradKernelTemplate<false, false, TEX_MODE_LINEAR>(p); }
+__global__ void TextureGradKernelLinearMipmapNearest        (const TextureKernelParams p) { TextureGradKernelTemplate<false, false, TEX_MODE_LINEAR_MIPMAP_NEAREST>(p); }
+__global__ void TextureGradKernelLinearMipmapLinear         (const TextureKernelParams p) { TextureGradKernelTemplate<false, false, TEX_MODE_LINEAR_MIPMAP_LINEAR>(p); }
+__global__ void TextureGradKernelCubeNearest                (const TextureKernelParams p) { TextureGradKernelTemplate<true,  false, TEX_MODE_NEAREST>(p); }
+__global__ void TextureGradKernelCubeLinear                 (const TextureKernelParams p) { TextureGradKernelTemplate<true,  false, TEX_MODE_LINEAR>(p); }
+__global__ void TextureGradKernelCubeLinearMipmapNearest    (const TextureKernelParams p) { TextureGradKernelTemplate<true,  false, TEX_MODE_LINEAR_MIPMAP_NEAREST>(p); }
+__global__ void TextureGradKernelCubeLinearMipmapLinear     (const TextureKernelParams p) { TextureGradKernelTemplate<true,  false, TEX_MODE_LINEAR_MIPMAP_LINEAR>(p); }
+__global__ void TextureGradKernelLinearMipmapNearestBO      (const TextureKernelParams p) { TextureGradKernelTemplate<false, true,  TEX_MODE_LINEAR_MIPMAP_NEAREST>(p); }
+__global__ void TextureGradKernelLinearMipmapLinearBO       (const TextureKernelParams p) { TextureGradKernelTemplate<false, true,  TEX_MODE_LINEAR_MIPMAP_LINEAR>(p); }
+__global__ void TextureGradKernelCubeLinearMipmapNearestBO  (const TextureKernelParams p) { TextureGradKernelTemplate<true,  true,  TEX_MODE_LINEAR_MIPMAP_NEAREST>(p); }
+__global__ void TextureGradKernelCubeLinearMipmapLinearBO   (const TextureKernelParams p) { TextureGradKernelTemplate<true,  true,  TEX_MODE_LINEAR_MIPMAP_LINEAR>(p); }
+
+//------------------------------------------------------------------------
diff --git a/LAM_gpro/external/nvdiffrast/nvdiffrast/common/texture.h b/LAM_gpro/external/nvdiffrast/nvdiffrast/common/texture.h
new file mode 100644
index 0000000..f79b600
--- /dev/null
+++ b/LAM_gpro/external/nvdiffrast/nvdiffrast/common/texture.h
@@ -0,0 +1,78 @@
+// Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
+//
+// NVIDIA CORPORATION and its licensors retain all intellectual property
+// and proprietary rights in and to this software, related documentation
+// and any modifications thereto. Any use, reproduction, disclosure or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA CORPORATION is strictly prohibited.
+
+#pragma once
+#include "framework.h"
+
+//------------------------------------------------------------------------
+// Constants.
+
+#define TEX_DEBUG_MIP_RETAIN_VARIANCE 0 // For debugging
+#define TEX_FWD_MAX_KERNEL_BLOCK_WIDTH 8
+#define TEX_FWD_MAX_KERNEL_BLOCK_HEIGHT 8
+#define TEX_FWD_MAX_MIP_KERNEL_BLOCK_WIDTH 8
+#define TEX_FWD_MAX_MIP_KERNEL_BLOCK_HEIGHT 8
+#define TEX_GRAD_MAX_KERNEL_BLOCK_WIDTH 8
+#define TEX_GRAD_MAX_KERNEL_BLOCK_HEIGHT 8
+#define TEX_GRAD_MAX_MIP_KERNEL_BLOCK_WIDTH 8
+#define TEX_GRAD_MAX_MIP_KERNEL_BLOCK_HEIGHT 8
+#define TEX_MAX_MIP_LEVEL 16 // Currently a texture cannot be larger than 2 GB because we use 32-bit indices everywhere.
+#define TEX_MODE_NEAREST 0 // Nearest on base level.
+#define TEX_MODE_LINEAR 1 // Bilinear on base level.
+#define TEX_MODE_LINEAR_MIPMAP_NEAREST 2 // Bilinear on nearest mip level.
+#define TEX_MODE_LINEAR_MIPMAP_LINEAR 3 // Trilinear.
+#define TEX_MODE_COUNT 4
+#define TEX_BOUNDARY_MODE_CUBE 0 // Cube map mode.
+#define TEX_BOUNDARY_MODE_WRAP 1 // Wrap (u, v).
+#define TEX_BOUNDARY_MODE_CLAMP 2 // Clamp (u, v).
+#define TEX_BOUNDARY_MODE_ZERO 3 // Pad with zeros.
+#define TEX_BOUNDARY_MODE_COUNT 4
+
+//------------------------------------------------------------------------
+// CUDA kernel params.
+
+struct TextureKernelParams
+{
+    // Parameter block shared by all texture kernels (forward, gradient and
+    // mip builder/puller). Pointer fields may be NULL where noted.
+    const float*    tex[TEX_MAX_MIP_LEVEL];     // Incoming texture buffer with mip levels.
+    const float*    uv;                         // Incoming texcoord buffer.
+    const float*    uvDA;                       // Incoming uv pixel diffs or NULL.
+    const float*    mipLevelBias;               // Incoming mip level bias or NULL.
+    const float*    dy;                         // Incoming output gradient.
+    float*          out;                        // Outgoing texture data.
+    float*          gradTex[TEX_MAX_MIP_LEVEL]; // Outgoing texture gradients with mip levels.
+    float*          gradUV;                     // Outgoing texcoord gradient.
+    float*          gradUVDA;                   // Outgoing texcoord pixel differential gradient.
+    float*          gradMipLevelBias;           // Outgoing mip level bias gradient.
+    int             enableMip;                  // If true, we have uv_da and/or mip_level_bias input(s), and a mip tensor.
+    int             filterMode;                 // One of the TEX_MODE_ constants.
+    int             boundaryMode;               // One of the TEX_BOUNDARY_MODE_ constants.
+    int             texConst;                   // If true, texture is known to be constant.
+    int             mipLevelLimit;              // Mip level limit coming from the op.
+    int             channels;                   // Number of texture channels.
+    int             imgWidth;                   // Image width.
+    int             imgHeight;                  // Image height.
+    int             texWidth;                   // Texture width.
+    int             texHeight;                  // Texture height.
+    int             texDepth;                   // Texture depth (minibatch of textures; 1 broadcasts one texture).
+    int             n;                          // Minibatch size.
+    int             mipLevelMax;                // Maximum mip level index. Zero if mips disabled.
+    int             mipLevelOut;                // Mip level being calculated in builder kernel.
+};
+
+//------------------------------------------------------------------------
+// C++ helper function prototypes.
+
+void raiseMipSizeError(NVDR_CTX_ARGS, const TextureKernelParams& p);
+int calculateMipInfo(NVDR_CTX_ARGS, TextureKernelParams& p, int* mipOffsets);
+
+//------------------------------------------------------------------------
+// Macros.
+
+#define mipLevelSize(p, i) make_int2(((p).texWidth >> (i)) > 1 ? ((p).texWidth >> (i)) : 1, ((p).texHeight >> (i)) > 1 ? ((p).texHeight >> (i)) : 1)
+
+//------------------------------------------------------------------------
diff --git a/LAM_gpro/external/nvdiffrast/nvdiffrast/lib/setgpu.lib b/LAM_gpro/external/nvdiffrast/nvdiffrast/lib/setgpu.lib
new file mode 100644
index 0000000..add9a0c
Binary files /dev/null and b/LAM_gpro/external/nvdiffrast/nvdiffrast/lib/setgpu.lib differ
diff --git a/LAM_gpro/external/nvdiffrast/nvdiffrast/tensorflow/__init__.py b/LAM_gpro/external/nvdiffrast/nvdiffrast/tensorflow/__init__.py
new file mode 100644
index 0000000..cf62df8
--- /dev/null
+++ b/LAM_gpro/external/nvdiffrast/nvdiffrast/tensorflow/__init__.py
@@ -0,0 +1,12 @@
+# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
+#
+# NVIDIA CORPORATION and its licensors retain all intellectual property
+# and proprietary rights in and to this software, related documentation
+# and any modifications thereto. Any use, reproduction, disclosure or
+# distribution of this software and related documentation without an express
+# license agreement from NVIDIA CORPORATION is strictly prohibited.
+
+from .ops import rasterize, interpolate, texture, antialias
+from .plugin_loader import set_cache_dir
+
+__all__ = ["rasterize", "interpolate", "texture", "antialias", "set_cache_dir"]
diff --git a/LAM_gpro/external/nvdiffrast/nvdiffrast/tensorflow/ops.py b/LAM_gpro/external/nvdiffrast/nvdiffrast/tensorflow/ops.py
new file mode 100644
index 0000000..be51dee
--- /dev/null
+++ b/LAM_gpro/external/nvdiffrast/nvdiffrast/tensorflow/ops.py
@@ -0,0 +1,303 @@
+# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
+#
+# NVIDIA CORPORATION and its licensors retain all intellectual property
+# and proprietary rights in and to this software, related documentation
+# and any modifications thereto. Any use, reproduction, disclosure or
+# distribution of this software and related documentation without an express
+# license agreement from NVIDIA CORPORATION is strictly prohibited.
+
+import tensorflow as tf
+import numpy as np
+import os
+from . import plugin_loader
+
+#----------------------------------------------------------------------------
+# Helpers.
+#----------------------------------------------------------------------------
+
+# OpenGL-related linker options depending on platform.
+def _get_gl_opts():
+ libs = {
+ 'posix': ['GL', 'EGL'],
+ 'nt': ['gdi32', 'opengl32', 'user32', 'setgpu'],
+ }
+ return ['-l' + x for x in libs[os.name]]
+
+# Load the cpp plugin.
+def _get_plugin():
+ fn = os.path.join(os.path.dirname(__file__), 'tf_all.cu')
+ return plugin_loader.get_plugin(fn, extra_nvcc_options=_get_gl_opts() + ['-DNVDR_TENSORFLOW'])
+
+# Convert parameter to a numpy array if possible.
+def _get_constant(x, dtype):
+ try:
+ return np.asarray(x, dtype=dtype)
+ except (TypeError, ValueError):
+ return None
+
+# Tests for a construction-time constantness instead of tf.constant node because
+# the latter can be overridden in Session.run() feed_dict at evaluation time.
+def _is_constant(x, dtype):
+ if isinstance(x, np.ndarray):
+ return np.can_cast(x.dtype, dtype, 'unsafe')
+ else:
+ return _get_constant(x, dtype) is not None
+
+#----------------------------------------------------------------------------
+# Rasterize.
+#----------------------------------------------------------------------------
+
+def rasterize(pos, tri, resolution, ranges=None, tri_const=False, output_db=True, grad_db=True):
+    """Rasterize triangles (TensorFlow custom op wrapper).
+
+    Args:
+        pos: Vertex position tensor; rank 3 selects instanced mode, rank 2
+            selects range mode (see shape inference below).
+        tri: Triangle index tensor, converted to int32.
+        resolution: Output resolution; a shape-(2,) constant when known at
+            graph-construction time.
+        ranges: Per-image triangle ranges for range mode, or None for an
+            empty [0, 2] int32 tensor.
+        tri_const: If True (or if tri is a construction-time constant), the
+            plugin is told the triangle tensor is constant.
+        output_db: If True, the op also outputs pixel differentials.
+        grad_db: If True, gradients flow through the differential output.
+
+    Returns:
+        Tuple (out, out_db) of float32 tensors; both have 4 channels when
+        output_db is True, out_db has 0 channels otherwise.
+    """
+    assert tri_const is True or tri_const is False
+    assert output_db is True or output_db is False
+
+    # Known constant resolution?
+    resolution_c = _get_constant(resolution, np.int32)
+
+    # Known constant triangles?
+    tri_const = tri_const or _is_constant(tri, np.int32)
+
+    # Convert all inputs to tensors / base types.
+    tri_const = 1 if tri_const else 0
+    tri = tf.convert_to_tensor(tri, dtype=tf.int32)
+    pos = tf.convert_to_tensor(pos, dtype=tf.float32)
+    resolution = tf.convert_to_tensor(resolution, dtype=tf.int32)
+    if ranges is None:
+        ranges = tf.convert_to_tensor(np.zeros(shape=[0, 2], dtype=np.int32)) # Empty tensor.
+    else:
+        ranges = tf.convert_to_tensor(ranges, dtype=tf.int32) # Convert input to tensor.
+
+    # Infer as much about the output shape as possible.
+    out_shape = [None, None, None, 4]
+    if pos.shape.rank == 3: # Instanced mode.
+        out_shape[0] = pos.shape[0].value
+    elif pos.shape.rank == 2: # Range mode.
+        if ranges.shape.rank not in [None, 0]:
+            out_shape[0] = ranges.shape[0].value
+    if resolution_c is not None:
+        assert resolution_c.shape == (2,)
+        out_shape[1], out_shape[2] = resolution_c
+
+    # Output pixel differentials.
+    @tf.custom_gradient
+    def func_db(pos):
+        out, out_db = _get_plugin().rasterize_fwd(pos, tri, resolution, ranges, 1, tri_const)
+        out.set_shape(out_shape)
+        out_db.set_shape(out_shape)
+        def grad(dy, ddb):
+            # Route through the differential-aware gradient op only when the
+            # caller wants gradients from the out_db output.
+            if grad_db:
+                return _get_plugin().rasterize_grad_db(pos, tri, out, dy, ddb)
+            else:
+                return _get_plugin().rasterize_grad(pos, tri, out, dy)
+        return (out, out_db), grad
+
+    # Do not output pixel differentials.
+    @tf.custom_gradient
+    def func(pos):
+        out, out_db = _get_plugin().rasterize_fwd(pos, tri, resolution, ranges, 0, tri_const)
+        out.set_shape(out_shape)
+        out_db.set_shape(out_shape[:-1] + [0]) # Zero channels in out_db.
+        def grad(dy, _):
+            return _get_plugin().rasterize_grad(pos, tri, out, dy)
+        return (out, out_db), grad
+
+    # Choose stub.
+    if output_db:
+        return func_db(pos)
+    else:
+        return func(pos)
+
+#----------------------------------------------------------------------------
+# Interpolate.
+#----------------------------------------------------------------------------
+
+def interpolate(attr, rast, tri, rast_db=None, diff_attrs=None):
+    """Interpolate vertex attributes over rasterized pixels.
+
+    Args:
+        attr: Attribute tensor, rank 2 or 3; last dimension is the attribute
+            channel count.
+        rast: Rasterizer output tensor, rank 4.
+        tri: Triangle index tensor, converted to int32.
+        rast_db: Rasterizer differential output; required when diff_attrs is
+            non-empty.
+        diff_attrs: None for no attribute differentials, 'all' for all
+            attributes, or a list of attribute indices.
+
+    Returns:
+        Tuple (out, out_da); out_da has zero channels when diff_attrs is
+        empty.
+    """
+    # Sanitize the list of pixel differential attributes.
+    if diff_attrs is None:
+        diff_attrs = []
+    elif diff_attrs != 'all':
+        diff_attrs = _get_constant(diff_attrs, np.int32)
+        assert (diff_attrs is not None) and len(diff_attrs.shape) == 1
+        diff_attrs = diff_attrs.tolist()
+
+    # Convert all inputs to tensors.
+    attr = tf.convert_to_tensor(attr, dtype=tf.float32)
+    rast = tf.convert_to_tensor(rast, dtype=tf.float32)
+    tri = tf.convert_to_tensor(tri, dtype=tf.int32)
+    if diff_attrs:
+        rast_db = tf.convert_to_tensor(rast_db, dtype=tf.float32)
+
+    # Infer output shape.
+    out_shape = [None, None, None, None]
+    if rast.shape.rank is not None:
+        out_shape = [rast.shape[0].value, rast.shape[1].value, rast.shape[2].value, None]
+    if attr.shape.rank in [2, 3]:
+        out_shape[3] = attr.shape[-1].value
+
+    # Output pixel differentials for at least some attributes.
+    @tf.custom_gradient
+    def func_da(attr, rast, rast_db):
+        diff_attrs_all = int(diff_attrs == 'all')
+        diff_attrs_list = [] if diff_attrs_all else diff_attrs
+        out, out_da = _get_plugin().interpolate_fwd_da(attr, rast, tri, rast_db, diff_attrs_all, diff_attrs_list)
+
+        # Infer number of channels in out_da: two (x and y differential) per
+        # selected attribute, or per attribute channel when 'all' is selected.
+        if not diff_attrs_all:
+            da_channels = 2 * len(diff_attrs)
+        if (attr.shape.rank in [2, 3]) and (attr.shape[-1].value is not None):
+            da_channels = 2 * attr.shape[-1].value
+        else:
+            da_channels = None
+
+        # Set output shapes.
+        out.set_shape(out_shape)
+        out_da.set_shape([out_shape[0], out_shape[1], out_shape[2], da_channels])
+
+        def grad(dy, dda):
+            return _get_plugin().interpolate_grad_da(attr, rast, tri, dy, rast_db, dda, diff_attrs_all, diff_attrs_list)
+        return (out, out_da), grad
+
+    # No pixel differentials for any attribute.
+    @tf.custom_gradient
+    def func(attr, rast):
+        out, out_da = _get_plugin().interpolate_fwd(attr, rast, tri)
+        out.set_shape(out_shape)
+        out_da.set_shape(out_shape[:-1] + [0]) # Zero channels in out_da.
+        def grad(dy, _):
+            return _get_plugin().interpolate_grad(attr, rast, tri, dy)
+        return (out, out_da), grad
+
+    # Choose stub.
+    if diff_attrs:
+        return func_da(attr, rast, rast_db)
+    else:
+        return func(attr, rast)
+
+#----------------------------------------------------------------------------
+# Texture.
+#----------------------------------------------------------------------------
+
+def texture(tex, uv, uv_da=None, filter_mode='auto', boundary_mode='wrap', tex_const=False, max_mip_level=None):
+    '''Texture sampling op (TF1-style, via the compiled custom-op plugin).
+
+    Chooses one of four internal tf.custom_gradient stubs depending on the
+    effective filter mode, so that gradients are produced only for the
+    tensors that can receive them (tex always; uv except in 'nearest' mode;
+    uv_da only in 'linear-mipmap-linear' mode).
+
+    Args:
+        tex:           Texture tensor; rank 4, or rank 5 when boundary_mode == 'cube'.
+        uv:            Texture coordinate tensor; rank 4.
+        uv_da:         Screen-space derivatives of uv; required by the mipmapped modes.
+        filter_mode:   'auto', 'nearest', 'linear', 'linear-mipmap-nearest', or
+                       'linear-mipmap-linear'. 'auto' selects mipmapped filtering
+                       iff uv_da was given.
+        boundary_mode: 'cube', 'wrap', 'clamp', or 'zero'.
+        tex_const:     Hint that tex is constant; also inferred via _is_constant().
+        max_mip_level: Highest mip level to use; None disables the limit (-1 internally).
+    '''
+    assert tex_const is True or tex_const is False
+
+    # Default filter mode.
+    if filter_mode == 'auto':
+        filter_mode = 'linear-mipmap-linear' if (uv_da is not None) else 'linear'
+
+    # Known constant texture?
+    tex_const = tex_const or _is_constant(tex, np.float32)
+
+    # Sanitize inputs. The plugin expects tex_const as 0/1 and -1 for "no mip limit".
+    tex_const = 1 if tex_const else 0
+    if max_mip_level is None:
+        max_mip_level = -1
+    else:
+        max_mip_level = int(max_mip_level)
+        assert max_mip_level >= 0
+
+    # Convert inputs to tensors. uv_da is only converted when a mipmapped mode
+    # actually needs it (it may legitimately be None otherwise).
+    tex = tf.convert_to_tensor(tex, dtype=tf.float32)
+    uv = tf.convert_to_tensor(uv, dtype=tf.float32)
+    if 'mipmap' in filter_mode:
+        uv_da = tf.convert_to_tensor(uv_da, dtype=tf.float32)
+
+    # Infer output shape: minibatch/height/width from uv, channel count from tex.
+    out_shape = [None, None, None, None]
+    if uv.shape.rank is not None:
+        assert uv.shape.rank == 4
+        out_shape = [uv.shape[0].value, uv.shape[1].value, uv.shape[2].value, None]
+    if tex.shape.rank is not None:
+        assert tex.shape.rank == (5 if boundary_mode == 'cube' else 4)
+        out_shape[-1] = tex.shape[-1].value
+
+    # If mipping disabled via max level=0, we may as well use simpler filtering internally.
+    if max_mip_level == 0 and filter_mode in ['linear-mipmap-nearest', 'linear-mipmap-linear']:
+        filter_mode = 'linear'
+
+    # Convert filter mode to internal enumeration.
+    filter_mode_dict = {'nearest': 0, 'linear': 1, 'linear-mipmap-nearest': 2, 'linear-mipmap-linear': 3}
+    filter_mode_enum = filter_mode_dict[filter_mode]
+
+    # Convert boundary mode to internal enumeration.
+    boundary_mode_dict = {'cube': 0, 'wrap': 1, 'clamp': 2, 'zero': 3}
+    boundary_mode_enum = boundary_mode_dict[boundary_mode]
+
+    # Linear-mipmap-linear: Mipmaps enabled, all gradients active.
+    @tf.custom_gradient
+    def func_linear_mipmap_linear(tex, uv, uv_da):
+        out, mip = _get_plugin().texture_fwd_mip(tex, uv, uv_da, filter_mode_enum, boundary_mode_enum, tex_const, max_mip_level)
+        out.set_shape(out_shape)
+        def grad(dy):
+            return _get_plugin().texture_grad_linear_mipmap_linear(tex, uv, dy, uv_da, mip, filter_mode_enum, boundary_mode_enum, max_mip_level)
+        return out, grad
+
+    # Linear-mipmap-nearest: Mipmaps enabled, no gradients to uv_da.
+    # uv_da is captured by the closure rather than passed as a stub argument,
+    # so tf.custom_gradient never routes a gradient to it.
+    @tf.custom_gradient
+    def func_linear_mipmap_nearest(tex, uv):
+        out, mip = _get_plugin().texture_fwd_mip(tex, uv, uv_da, filter_mode_enum, boundary_mode_enum, tex_const, max_mip_level)
+        out.set_shape(out_shape)
+        def grad(dy):
+            return _get_plugin().texture_grad_linear_mipmap_nearest(tex, uv, dy, uv_da, mip, filter_mode_enum, boundary_mode_enum, max_mip_level)
+        return out, grad
+
+    # Linear: Mipmaps disabled, no uv_da, no gradients to uv_da.
+    @tf.custom_gradient
+    def func_linear(tex, uv):
+        out = _get_plugin().texture_fwd(tex, uv, filter_mode_enum, boundary_mode_enum)
+        out.set_shape(out_shape)
+        def grad(dy):
+            return _get_plugin().texture_grad_linear(tex, uv, dy, filter_mode_enum, boundary_mode_enum)
+        return out, grad
+
+    # Nearest: Mipmaps disabled, no uv_da, no gradients to uv_da or uv.
+    # Here uv is captured by the closure, so only tex receives a gradient.
+    @tf.custom_gradient
+    def func_nearest(tex):
+        out = _get_plugin().texture_fwd(tex, uv, filter_mode_enum, boundary_mode_enum)
+        out.set_shape(out_shape)
+        def grad(dy):
+            return _get_plugin().texture_grad_nearest(tex, uv, dy, filter_mode_enum, boundary_mode_enum)
+        return out, grad
+
+    # Choose stub.
+    if filter_mode == 'linear-mipmap-linear':
+        return func_linear_mipmap_linear(tex, uv, uv_da)
+    elif filter_mode == 'linear-mipmap-nearest':
+        return func_linear_mipmap_nearest(tex, uv)
+    elif filter_mode == 'linear':
+        return func_linear(tex, uv)
+    elif filter_mode == 'nearest':
+        return func_nearest(tex)
+
+#----------------------------------------------------------------------------
+# Antialias.
+#----------------------------------------------------------------------------
+
+def antialias(color, rast, pos, tri, tri_const=False, pos_gradient_boost=1.0):
+    '''Antialiasing op (TF1-style, via the compiled custom-op plugin).
+
+    Returns an antialiased copy of `color`. Gradients flow to `color` and
+    `pos` only; `rast` and `tri` are captured by the inner stub's closure
+    instead of being tf.custom_gradient arguments, so they receive none.
+
+    Args:
+        color:              Color buffer tensor (converted to float32).
+        rast:               Rasterizer output tensor (converted to float32).
+        pos:                Vertex position tensor (converted to float32).
+        tri:                Triangle index tensor (converted to int32).
+        tri_const:          Hint that tri is constant; also inferred via
+                            _is_constant(). Passed to the plugin as 0/1.
+        pos_gradient_boost: Scale factor applied to the position gradient.
+    '''
+    assert tri_const is True or tri_const is False
+
+    # Known constant triangles?
+    tri_const = tri_const or _is_constant(tri, np.int32)
+
+    # Convert inputs to tensors.
+    color = tf.convert_to_tensor(color, dtype=tf.float32)
+    rast = tf.convert_to_tensor(rast, dtype=tf.float32)
+    pos = tf.convert_to_tensor(pos, dtype=tf.float32)
+    tri = tf.convert_to_tensor(tri, dtype=tf.int32)
+
+    # Sanitize inputs.
+    tri_const = 1 if tri_const else 0
+
+    @tf.custom_gradient
+    def func(color, pos):
+        # work_buffer produced by the forward pass is reused by the gradient kernel.
+        color_out, work_buffer = _get_plugin().antialias_fwd(color, rast, pos, tri, tri_const)
+        color_out.set_shape(color.shape)
+        def grad(dy):
+            grad_color, grad_pos = _get_plugin().antialias_grad(color, rast, pos, tri, dy, work_buffer)
+            if pos_gradient_boost != 1.0:
+                grad_pos = grad_pos * pos_gradient_boost
+            return grad_color, grad_pos
+        return color_out, grad
+
+    return func(color, pos)
+
+#----------------------------------------------------------------------------
diff --git a/LAM_gpro/external/nvdiffrast/nvdiffrast/tensorflow/plugin_loader.py b/LAM_gpro/external/nvdiffrast/nvdiffrast/tensorflow/plugin_loader.py
new file mode 100644
index 0000000..3918aec
--- /dev/null
+++ b/LAM_gpro/external/nvdiffrast/nvdiffrast/tensorflow/plugin_loader.py
@@ -0,0 +1,219 @@
+# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
+#
+# NVIDIA CORPORATION and its licensors retain all intellectual property
+# and proprietary rights in and to this software, related documentation
+# and any modifications thereto. Any use, reproduction, disclosure or
+# distribution of this software and related documentation without an express
+# license agreement from NVIDIA CORPORATION is strictly prohibited.
+
+import glob
+import os
+import re
+import uuid
+import hashlib
+import tempfile
+import shutil
+import tensorflow as tf
+from tensorflow.python.client import device_lib # pylint: disable=no-name-in-module
+
+#----------------------------------------------------------------------------
+# Global options.
+
+_nvdiffrast_cache_dir = None
+
+def set_cache_dir(path: str) -> None:
+    '''Set CUDA kernel compilation temp dir.
+
+    If `set_cache_dir` is not called, the cache directory will default to
+    one of the below:
+
+    - Value of NVDIFFRAST_CACHE_DIR env var, if set
+    - $HOME/.cache/nvdiffrast if HOME env var is set
+    - $USERPROFILE/.cache/nvdiffrast if USERPROFILE is set.
+    - <system temp dir>/.cache/nvdiffrast otherwise (see make_cache_dir_path).
+
+    Args:
+        path: Where to save CUDA kernel build temporaries
+    '''
+    global _nvdiffrast_cache_dir
+    _nvdiffrast_cache_dir = path
+
+def make_cache_dir_path(*paths: str) -> str:
+ if _nvdiffrast_cache_dir is not None:
+ return os.path.join(_nvdiffrast_cache_dir, *paths)
+ if 'NVDIFFRAST_CACHE_DIR' in os.environ:
+ return os.path.join(os.environ['NVDIFFRAST_CACHE_DIR'], *paths)
+ if 'HOME' in os.environ:
+ return os.path.join(os.environ['HOME'], '.cache', 'nvdiffrast', *paths)
+ if 'USERPROFILE' in os.environ:
+ return os.path.join(os.environ['USERPROFILE'], '.cache', 'nvdiffrast', *paths)
+ return os.path.join(tempfile.gettempdir(), '.cache', 'nvdiffrast', *paths)
+
+cuda_cache_version_tag = 'v1'
+do_not_hash_included_headers = False # Speed up compilation by assuming that headers included by the CUDA code never change. Unsafe!
+verbose = True # Print status messages to stdout.
+
+#----------------------------------------------------------------------------
+# Internal helper funcs.
+
+def _find_compiler_bindir():
+ hostx64_paths = sorted(glob.glob('C:/Program Files/Microsoft Visual Studio/*/Enterprise/VC/Tools/MSVC/*/bin/Hostx64/x64'), reverse=True)
+ if hostx64_paths != []:
+ return hostx64_paths[0]
+ hostx64_paths = sorted(glob.glob('C:/Program Files (x86)/Microsoft Visual Studio/*/Enterprise/VC/Tools/MSVC/*/bin/Hostx64/x64'), reverse=True)
+ if hostx64_paths != []:
+ return hostx64_paths[0]
+ hostx64_paths = sorted(glob.glob('C:/Program Files/Microsoft Visual Studio/*/Professional/VC/Tools/MSVC/*/bin/Hostx64/x64'), reverse=True)
+ if hostx64_paths != []:
+ return hostx64_paths[0]
+ hostx64_paths = sorted(glob.glob('C:/Program Files (x86)/Microsoft Visual Studio/*/Professional/VC/Tools/MSVC/*/bin/Hostx64/x64'), reverse=True)
+ if hostx64_paths != []:
+ return hostx64_paths[0]
+ hostx64_paths = sorted(glob.glob('C:/Program Files/Microsoft Visual Studio/*/BuildTools/VC/Tools/MSVC/*/bin/Hostx64/x64'), reverse=True)
+ if hostx64_paths != []:
+ return hostx64_paths[0]
+ hostx64_paths = sorted(glob.glob('C:/Program Files (x86)/Microsoft Visual Studio/*/BuildTools/VC/Tools/MSVC/*/bin/Hostx64/x64'), reverse=True)
+ if hostx64_paths != []:
+ return hostx64_paths[0]
+ hostx64_paths = sorted(glob.glob('C:/Program Files/Microsoft Visual Studio/*/Community/VC/Tools/MSVC/*/bin/Hostx64/x64'), reverse=True)
+ if hostx64_paths != []:
+ return hostx64_paths[0]
+ hostx64_paths = sorted(glob.glob('C:/Program Files (x86)/Microsoft Visual Studio/*/Community/VC/Tools/MSVC/*/bin/Hostx64/x64'), reverse=True)
+ if hostx64_paths != []:
+ return hostx64_paths[0]
+ vc_bin_dir = 'C:/Program Files (x86)/Microsoft Visual Studio 14.0/vc/bin'
+ if os.path.isdir(vc_bin_dir):
+ return vc_bin_dir
+ return None
+
+def _get_compute_cap(device):
+ caps_str = device.physical_device_desc
+ m = re.search('compute capability: (\\d+).(\\d+)', caps_str)
+ major = m.group(1)
+ minor = m.group(2)
+ return (major, minor)
+
+def _get_cuda_gpu_arch_string():
+ gpus = [x for x in device_lib.list_local_devices() if x.device_type == 'GPU']
+ if len(gpus) == 0:
+ raise RuntimeError('No GPU devices found')
+ (major, minor) = _get_compute_cap(gpus[0])
+ return 'sm_%s%s' % (major, minor)
+
+def _run_cmd(cmd):
+    # Run a shell command, capturing combined output (callers append '2>&1').
+    # os.popen().close() returns None on success and the non-zero exit status
+    # otherwise, hence the 'status is not None' check below.
+    with os.popen(cmd) as pipe:
+        output = pipe.read()
+        status = pipe.close()
+    if status is not None:
+        raise RuntimeError('NVCC returned an error. See below for full command line and output log:\n\n%s\n\n%s' % (cmd, output))
+
+def _prepare_nvcc_cli(opts):
+ cmd = 'nvcc ' + opts.strip()
+ cmd += ' --disable-warnings'
+ cmd += ' --include-path "%s"' % tf.sysconfig.get_include()
+ cmd += ' --include-path "%s"' % os.path.join(tf.sysconfig.get_include(), 'external', 'protobuf_archive', 'src')
+ cmd += ' --include-path "%s"' % os.path.join(tf.sysconfig.get_include(), 'external', 'com_google_absl')
+ cmd += ' --include-path "%s"' % os.path.join(tf.sysconfig.get_include(), 'external', 'eigen_archive')
+
+ compiler_bindir = _find_compiler_bindir()
+ if compiler_bindir is None:
+ # Require that _find_compiler_bindir succeeds on Windows. Allow
+ # nvcc to use whatever is the default on Linux.
+ if os.name == 'nt':
+ raise RuntimeError('Could not find MSVC/GCC/CLANG installation on this computer. Check compiler_bindir_search_path list in "%s".' % __file__)
+ else:
+ cmd += ' --compiler-bindir "%s"' % compiler_bindir
+ cmd += ' 2>&1'
+ return cmd
+
+#----------------------------------------------------------------------------
+# Main entry point.
+
+_plugin_cache = dict()
+
+def get_plugin(cuda_file, extra_nvcc_options=[]):
+    '''Compile (if needed) and load the TensorFlow op library built from a .cu file.
+
+    The compiled binary is cached on disk under a name containing an MD5 hash
+    of the CUDA source, the preprocessed includes (unless
+    do_not_hash_included_headers is set), the full nvcc command line, and the
+    TensorFlow version -- so a change to any of those triggers a recompile.
+    Successfully loaded plugins are also memoized in-process in _plugin_cache.
+
+    Args:
+        cuda_file:          Path to the CUDA source file to build.
+        extra_nvcc_options: Additional nvcc command-line options.
+                            (The mutable [] default is safe here: the list is
+                            only iterated, never mutated.)
+
+    Returns:
+        The op library module returned by tf.load_op_library().
+    '''
+    cuda_file_base = os.path.basename(cuda_file)
+    cuda_file_name, cuda_file_ext = os.path.splitext(cuda_file_base)
+
+    # Already in cache?
+    if cuda_file in _plugin_cache:
+        return _plugin_cache[cuda_file]
+
+    # Setup plugin.
+    if verbose:
+        print('Setting up TensorFlow plugin "%s": ' % cuda_file_base, end='', flush=True)
+    try:
+        # Hash CUDA source.
+        md5 = hashlib.md5()
+        with open(cuda_file, 'rb') as f:
+            md5.update(f.read())
+            md5.update(b'\n')
+
+        # Hash headers included by the CUDA code by running it through the preprocessor.
+        if not do_not_hash_included_headers:
+            if verbose:
+                print('Preprocessing... ', end='', flush=True)
+            with tempfile.TemporaryDirectory() as tmp_dir:
+                tmp_file = os.path.join(tmp_dir, cuda_file_name + '_tmp' + cuda_file_ext)
+                _run_cmd(_prepare_nvcc_cli('"%s" --preprocess -o "%s" --keep --keep-dir "%s"' % (cuda_file, tmp_file, tmp_dir)))
+                with open(tmp_file, 'rb') as f:
+                    # Normalize absolute source paths embedded by __FILE__ so the
+                    # hash is stable across checkout locations.
+                    bad_file_str = ('"' + cuda_file.replace('\\', '/') + '"').encode('utf-8') # __FILE__ in error check macros
+                    good_file_str = ('"' + cuda_file_base + '"').encode('utf-8')
+                    for ln in f:
+                        if not ln.startswith(b'# ') and not ln.startswith(b'#line '): # ignore line number pragmas
+                            ln = ln.replace(bad_file_str, good_file_str)
+                            md5.update(ln)
+                    md5.update(b'\n')
+
+        # Select compiler options.
+        compile_opts = ''
+        if os.name == 'nt':
+            compile_opts += '"%s"' % os.path.join(tf.sysconfig.get_lib(), 'python', '_pywrap_tensorflow_internal.lib')
+            compile_opts += ' --library-path="%s"' % (os.path.dirname(__file__) + r"\..\lib") # Find libraries during compilation.
+        elif os.name == 'posix':
+            compile_opts += '"%s"' % os.path.join(tf.sysconfig.get_lib(), 'python', '_pywrap_tensorflow_internal.so')
+            compile_opts += ' --compiler-options \'-fPIC -D_GLIBCXX_USE_CXX11_ABI=0\''
+        else:
+            assert False # not Windows or Linux, w00t?
+        compile_opts += ' --gpu-architecture=%s' % _get_cuda_gpu_arch_string()
+        compile_opts += ' --use_fast_math'
+        for opt in extra_nvcc_options:
+            compile_opts += ' ' + opt
+        nvcc_cmd = _prepare_nvcc_cli(compile_opts)
+
+        # Hash build configuration.
+        md5.update(('nvcc_cmd: ' + nvcc_cmd).encode('utf-8') + b'\n')
+        md5.update(('tf.VERSION: ' + tf.VERSION).encode('utf-8') + b'\n')
+        md5.update(('cuda_cache_version_tag: ' + cuda_cache_version_tag).encode('utf-8') + b'\n')
+
+        # Compile if not already compiled.
+        bin_file_ext = '.dll' if os.name == 'nt' else '.so'
+        cuda_cache_path = make_cache_dir_path()
+        bin_file = os.path.join(make_cache_dir_path(), cuda_file_name + '_' + md5.hexdigest() + bin_file_ext)
+        if not os.path.isfile(bin_file):
+            if verbose:
+                print('Compiling... ', end='', flush=True)
+            with tempfile.TemporaryDirectory() as tmp_dir:
+                tmp_file = os.path.join(tmp_dir, cuda_file_name + '_tmp' + bin_file_ext)
+                _run_cmd(nvcc_cmd + ' "%s" --shared -o "%s" --keep --keep-dir "%s"' % (cuda_file, tmp_file, tmp_dir))
+                # Copy to a unique temp name in the cache dir, then rename into
+                # place, so concurrent processes never see a partial binary.
+                os.makedirs(cuda_cache_path, exist_ok=True)
+                intermediate_file = os.path.join(cuda_cache_path, cuda_file_name + '_' + uuid.uuid4().hex + '_tmp' + bin_file_ext)
+                shutil.copyfile(tmp_file, intermediate_file)
+                os.rename(intermediate_file, bin_file) # atomic
+
+        # Load.
+        if verbose:
+            print('Loading... ', end='', flush=True)
+        plugin = tf.load_op_library(bin_file)
+
+        # Add to cache.
+        _plugin_cache[cuda_file] = plugin
+        if verbose:
+            print('Done.', flush=True)
+        return plugin
+
+    except:
+        if verbose:
+            print('Failed!', flush=True)
+        raise
+
+#----------------------------------------------------------------------------
diff --git a/LAM_gpro/external/nvdiffrast/nvdiffrast/tensorflow/tf_all.cu b/LAM_gpro/external/nvdiffrast/nvdiffrast/tensorflow/tf_all.cu
new file mode 100644
index 0000000..8eefcfb
--- /dev/null
+++ b/LAM_gpro/external/nvdiffrast/nvdiffrast/tensorflow/tf_all.cu
@@ -0,0 +1,36 @@
+// Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
+//
+// NVIDIA CORPORATION and its licensors retain all intellectual property
+// and proprietary rights in and to this software, related documentation
+// and any modifications thereto. Any use, reproduction, disclosure or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA CORPORATION is strictly prohibited.
+
+// TF-specific helpers.
+
+#define OP_CHECK_CUDA_ERROR(CTX, CUDA_CALL) do { cudaError_t err = CUDA_CALL; OP_REQUIRES(CTX, err == cudaSuccess, errors::Internal("Cuda error: ", cudaGetErrorName(err), "[", #CUDA_CALL, ";]")); } while (0)
+#define OP_CHECK_GL_ERROR(CTX, GL_CALL) do { GL_CALL; GLenum err = glGetError(); OP_REQUIRES(CTX, err == GL_NO_ERROR, errors::Internal("OpenGL error: ", getGLErrorString(err), "[", #GL_CALL, ";]")); } while (0)
+
+// Cuda kernels and CPP all together. What an absolute compilation unit.
+
+#define __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
+#include "../common/framework.h"
+#include "../common/glutil.cpp"
+
+#include "../common/common.h"
+#include "../common/common.cpp"
+
+#include "../common/rasterize.h"
+#include "../common/rasterize_gl.cpp"
+#include "../common/rasterize.cu"
+#include "tf_rasterize.cu"
+
+#include "../common/interpolate.cu"
+#include "tf_interpolate.cu"
+
+#include "../common/texture.cpp"
+#include "../common/texture.cu"
+#include "tf_texture.cu"
+
+#include "../common/antialias.cu"
+#include "tf_antialias.cu"
diff --git a/LAM_gpro/external/nvdiffrast/nvdiffrast/tensorflow/tf_antialias.cu b/LAM_gpro/external/nvdiffrast/nvdiffrast/tensorflow/tf_antialias.cu
new file mode 100644
index 0000000..9b14962
--- /dev/null
+++ b/LAM_gpro/external/nvdiffrast/nvdiffrast/tensorflow/tf_antialias.cu
@@ -0,0 +1,278 @@
+// Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
+//
+// NVIDIA CORPORATION and its licensors retain all intellectual property
+// and proprietary rights in and to this software, related documentation
+// and any modifications thereto. Any use, reproduction, disclosure or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA CORPORATION is strictly prohibited.
+
+//------------------------------------------------------------------------
+// Forward TensorFlow op.
+
+// Forward antialiasing op. Consumes color, raster output, clip-space
+// positions and triangle indices; produces the antialiased color buffer plus
+// an int32 work buffer that the gradient op reuses.
+struct AntialiasFwdOp : public OpKernel
+{
+    AntialiasKernelParams m_attribs;
+
+    AntialiasFwdOp(OpKernelConstruction* ctx): OpKernel(ctx)
+    {
+        memset(&m_attribs, 0, sizeof(m_attribs));
+        OP_REQUIRES_OK(ctx, ctx->GetAttr("tri_const", &m_attribs.tri_const));
+    }
+
+    void Compute(OpKernelContext* ctx)
+    {
+        AntialiasKernelParams& p = m_attribs;
+        // NOTE(review): template arguments on eigen_device<>() and the
+        // Tensor::flat<>() calls below appear to have been stripped by a text
+        // conversion; restored here -- the TF C++ API requires them.
+        cudaStream_t stream = ctx->eigen_device<Eigen::GpuDevice>().stream();
+
+        // Get input.
+        const Tensor& color     = ctx->input(0);
+        const Tensor& rasterOut = ctx->input(1);
+        const Tensor& pos       = ctx->input(2);
+        const Tensor& tri       = ctx->input(3);
+
+        // Instance rendering mode?
+        p.instance_mode = pos.dims() > 2;
+
+        // Extract input dimensions.
+        if (p.instance_mode)
+            p.numVertices = (pos.dims() > 1) ? pos.dim_size(1) : 0;
+        else
+            p.numVertices = (pos.dims() > 0) ? pos.dim_size(0) : 0;
+        p.numTriangles = (tri.dims() > 0) ? tri.dim_size(0) : 0;
+        p.n        = (color.dims() > 0) ? color.dim_size(0) : 0;
+        p.height   = (color.dims() > 1) ? color.dim_size(1) : 0;
+        p.width    = (color.dims() > 2) ? color.dim_size(2) : 0;
+        p.channels = (color.dims() > 3) ? color.dim_size(3) : 0;
+
+        // Sanity checks.
+        OP_REQUIRES(ctx, color.dims() == 4 && color.dim_size(0) > 0 && color.dim_size(1) > 0 && color.dim_size(2) > 0 && color.dim_size(3) > 0, errors::InvalidArgument("color must have shape[>0, >0, >0, >0]"));
+        OP_REQUIRES(ctx, rasterOut.dims() == 4 && rasterOut.dim_size(0) > 0 && rasterOut.dim_size(1) > 0 && rasterOut.dim_size(2) > 0 && rasterOut.dim_size(3) == 4, errors::InvalidArgument("raster_out must have shape[>0, >0, >0, 4]"));
+        OP_REQUIRES(ctx, tri.dims() == 2 && tri.dim_size(0) > 0 && tri.dim_size(1) == 3, errors::InvalidArgument("tri must have shape [>0, 3]"));
+        OP_REQUIRES(ctx, color.dim_size(1) == rasterOut.dim_size(1) && color.dim_size(2) == rasterOut.dim_size(2), errors::InvalidArgument("color and raster_out inputs must have same spatial dimensions"));
+        if (p.instance_mode)
+        {
+            OP_REQUIRES(ctx, pos.dims() == 3 && pos.dim_size(0) > 0 && pos.dim_size(1) > 0 && pos.dim_size(2) == 4, errors::InvalidArgument("pos must have shape [>0, >0, 4] or [>0, 4]"));
+            OP_REQUIRES(ctx, rasterOut.dim_size(0) == p.n && pos.dim_size(0) == p.n, errors::InvalidArgument("minibatch size mismatch between inputs color, raster_out, pos"));
+        }
+        else
+        {
+            OP_REQUIRES(ctx, pos.dims() == 2 && pos.dim_size(0) > 0 && pos.dim_size(1) == 4, errors::InvalidArgument("pos must have shape [>0, >0, 4] or [>0, 4]"));
+            OP_REQUIRES(ctx, rasterOut.dim_size(0) == p.n, errors::InvalidArgument("minibatch size mismatch between inputs color, raster_out"));
+        }
+
+        // Get input pointers.
+        p.color     = color.flat<float>().data();
+        p.rasterOut = rasterOut.flat<float>().data();
+        p.tri       = tri.flat<int>().data();
+        p.pos       = pos.flat<float>().data();
+
+        // Misc parameters: half-resolution constants used by the kernels.
+        p.xh = .5f * (float)p.width;
+        p.yh = .5f * (float)p.height;
+
+        // Allocate output tensor.
+        Tensor* outputTensor = NULL;
+        TensorShape outputShape;
+        outputShape.AddDim(p.n);
+        outputShape.AddDim(p.height);
+        outputShape.AddDim(p.width);
+        outputShape.AddDim(p.channels);
+        OP_REQUIRES_OK(ctx, ctx->allocate_output(0, outputShape, &outputTensor));
+        p.output = outputTensor->flat<float>().data();
+
+        // Allocate work buffer. One extra int4 for storing counters.
+        Tensor* workTensor = NULL;
+        TensorShape workShape;
+        workShape.AddDim(p.n * p.width * p.height * 8 + 4); // 8 int for a maximum of two work items per pixel.
+        OP_REQUIRES_OK(ctx, ctx->allocate_output(1, workShape, &workTensor));
+        p.workBuffer = (int4*)(workTensor->flat<int>().data());
+
+        // Clear the work counters.
+        OP_CHECK_CUDA_ERROR(ctx, cudaMemsetAsync(p.workBuffer, 0, sizeof(int4), stream));
+
+        // Verify that buffers are aligned to allow float2/float4 operations.
+        OP_REQUIRES(ctx, !((uintptr_t)p.pos & 15), errors::Internal("pos input tensor not aligned to float4"));
+        OP_REQUIRES(ctx, !((uintptr_t)p.rasterOut & 7), errors::Internal("raster_out input tensor not aligned to float2"));
+        OP_REQUIRES(ctx, !((uintptr_t)p.workBuffer & 15), errors::Internal("work_buffer internal tensor not aligned to int4"));
+
+        // Kernel parameters.
+        void* args[] = {&p};
+
+        // (Re-)calculate opposite vertex hash. Skipped when the hash exists and
+        // the triangle tensor is known to be constant.
+        if (!p.evHash || !p.tri_const)
+        {
+            if (p.allocTriangles < p.numTriangles)
+            {
+                p.allocTriangles = max(p.allocTriangles, 64);
+                while (p.allocTriangles < p.numTriangles)
+                    p.allocTriangles <<= 1; // Must be power of two.
+
+                // (Re-)allocate memory for the hash.
+                OP_CHECK_CUDA_ERROR(ctx, cudaFree(p.evHash));
+                OP_CHECK_CUDA_ERROR(ctx, cudaMalloc(&p.evHash, p.allocTriangles * AA_HASH_ELEMENTS_PER_TRIANGLE(p.allocTriangles) * sizeof(uint4)));
+                LOG(INFO) << "Increasing topology hash size to accommodate " << p.allocTriangles << " triangles";
+            }
+
+            // Clear the hash and launch the mesh kernel to populate it.
+            OP_CHECK_CUDA_ERROR(ctx, cudaMemsetAsync(p.evHash, 0, p.allocTriangles * AA_HASH_ELEMENTS_PER_TRIANGLE(p.allocTriangles) * sizeof(uint4), stream));
+            OP_CHECK_CUDA_ERROR(ctx, cudaLaunchKernel((void*)AntialiasFwdMeshKernel, (p.numTriangles - 1) / AA_MESH_KERNEL_THREADS_PER_BLOCK + 1, AA_MESH_KERNEL_THREADS_PER_BLOCK, args, 0, stream));
+        }
+
+        // Copy input to output as a baseline.
+        OP_CHECK_CUDA_ERROR(ctx, cudaMemcpyAsync(p.output, p.color, p.n * p.height * p.width * p.channels * sizeof(float), cudaMemcpyDeviceToDevice, stream));
+
+        // Choose launch parameters for the discontinuity finder kernel and launch.
+        dim3 blockSize(AA_DISCONTINUITY_KERNEL_BLOCK_WIDTH, AA_DISCONTINUITY_KERNEL_BLOCK_HEIGHT, 1);
+        dim3 gridSize = getLaunchGridSize(blockSize, p.width, p.height, p.n);
+        OP_CHECK_CUDA_ERROR(ctx, cudaLaunchKernel((void*)AntialiasFwdDiscontinuityKernel, gridSize, blockSize, args, 0, stream));
+
+        // Determine optimum block size for the persistent analysis kernel.
+        int device = 0;
+        int numCTA = 0;
+        int numSM  = 0;
+        OP_CHECK_CUDA_ERROR(ctx, cudaGetDevice(&device));
+        OP_CHECK_CUDA_ERROR(ctx, cudaOccupancyMaxActiveBlocksPerMultiprocessor(&numCTA, (void*)AntialiasFwdAnalysisKernel, AA_ANALYSIS_KERNEL_THREADS_PER_BLOCK, 0));
+        OP_CHECK_CUDA_ERROR(ctx, cudaDeviceGetAttribute(&numSM, cudaDevAttrMultiProcessorCount, device));
+
+        // Launch analysis kernel as a persistent grid sized to fill the device.
+        OP_CHECK_CUDA_ERROR(ctx, cudaLaunchKernel((void*)AntialiasFwdAnalysisKernel, numCTA * numSM, AA_ANALYSIS_KERNEL_THREADS_PER_BLOCK, args, 0, stream));
+    }
+};
+
+REGISTER_OP("AntialiasFwd")
+    .Input      ("color: float")
+    .Input      ("raster_out: float")
+    .Input      ("pos: float")
+    .Input      ("tri: int32")
+    .Output     ("output: float")
+    .Output     ("work_buffer: int32")
+    .Attr       ("tri_const: int");
+
+REGISTER_KERNEL_BUILDER(Name("AntialiasFwd").Device(DEVICE_GPU), AntialiasFwdOp);
+
+//------------------------------------------------------------------------
+// Gradient TensorFlow op.
+
+struct AntialiasGradOp : public OpKernel
+{
+ AntialiasKernelParams m_attribs;
+
+ AntialiasGradOp(OpKernelConstruction* ctx): OpKernel(ctx)
+ {
+ memset(&m_attribs, 0, sizeof(m_attribs));
+ }
+
+ void Compute(OpKernelContext* ctx)
+ {
+ AntialiasKernelParams& p = m_attribs;
+ cudaStream_t stream = ctx->eigen_device().stream();
+
+ // Get input.
+ const Tensor& color = ctx->input(0);
+ const Tensor& rasterOut = ctx->input(1);
+ const Tensor& pos = ctx->input(2);
+ const Tensor& tri = ctx->input(3);
+ const Tensor& dy = ctx->input(4);
+ const Tensor& workBuffer = ctx->input(5);
+
+ // Instance rendering mode?
+ p.instance_mode = pos.dims() > 2;
+
+ // Extract input dimensions.
+ if (p.instance_mode)
+ p.numVertices = (pos.dims() > 1) ? pos.dim_size(1) : 0;
+ else
+ p.numVertices = (pos.dims() > 0) ? pos.dim_size(0) : 0;
+ p.numTriangles = (tri.dims() > 0) ? tri.dim_size(0) : 0;
+ p.n = (color.dims() > 0) ? color.dim_size(0) : 0;
+ p.height = (color.dims() > 1) ? color.dim_size(1) : 0;
+ p.width = (color.dims() > 2) ? color.dim_size(2) : 0;
+ p.channels = (color.dims() > 3) ? color.dim_size(3) : 0;
+
+ // Sanity checks.
+ OP_REQUIRES(ctx, dy.dims() == 4 && dy.dim_size(0) > 0 && dy.dim_size(1) > 0 && dy.dim_size(2) > 0 && dy.dim_size(3) > 0, errors::InvalidArgument("dy must have shape[>0, >0, >0, >0]"));
+ OP_REQUIRES(ctx, color.dims() == 4 && color.dim_size(0) > 0 && color.dim_size(1) > 0 && color.dim_size(2) > 0 && color.dim_size(3) > 0, errors::InvalidArgument("color must have shape[>0, >0, >0, >0]"));
+ OP_REQUIRES(ctx, rasterOut.dims() == 4 && rasterOut.dim_size(0) > 0 && rasterOut.dim_size(1) > 0 && rasterOut.dim_size(2) > 0 && rasterOut.dim_size(3) == 4, errors::InvalidArgument("raster_out must have shape[>0, >0, >0, 4]"));
+ OP_REQUIRES(ctx, tri.dims() == 2 && tri.dim_size(0) > 0 && tri.dim_size(1) == 3, errors::InvalidArgument("tri must have shape [>0, 3]"));
+ OP_REQUIRES(ctx, color.dim_size(1) == rasterOut.dim_size(1) && color.dim_size(2) == rasterOut.dim_size(2), errors::InvalidArgument("color and raster_out inputs must have same spatial dimensions"));
+ OP_REQUIRES(ctx, color.dim_size(1) == dy.dim_size(1) && color.dim_size(2) == dy.dim_size(2) && color.dim_size(3) == dy.dim_size(3), errors::InvalidArgument("color and dy inputs must have same dimensions"));
+ if (p.instance_mode)
+ {
+ OP_REQUIRES(ctx, pos.dims() == 3 && pos.dim_size(0) > 0 && pos.dim_size(1) > 0 && pos.dim_size(2) == 4, errors::InvalidArgument("pos must have shape [>0, >0, 4] or [>0, 4]"));
+ OP_REQUIRES(ctx, rasterOut.dim_size(0) == p.n && pos.dim_size(0) == p.n, errors::InvalidArgument("minibatch size mismatch between inputs color, raster_out, pos"));
+ OP_REQUIRES(ctx, dy.dim_size(0) == p.n && rasterOut.dim_size(0) == p.n && pos.dim_size(0) == p.n, errors::InvalidArgument("minibatch size mismatch between inputs dy, color, raster_out, pos"));
+ }
+ else
+ {
+ OP_REQUIRES(ctx, pos.dims() == 2 && pos.dim_size(0) > 0 && pos.dim_size(1) == 4, errors::InvalidArgument("pos must have shape [>0, >0, 4] or [>0, 4]"));
+ OP_REQUIRES(ctx, rasterOut.dim_size(0) == p.n, errors::InvalidArgument("minibatch size mismatch between inputs color, raster_out"));
+ OP_REQUIRES(ctx, dy.dim_size(0) == p.n && rasterOut.dim_size(0) == p.n, errors::InvalidArgument("minibatch size mismatch between inputs dy, color, raster_out"));
+ }
+
+ // Get input pointers.
+ p.dy = dy.flat().data();
+ p.color = color.flat().data();
+ p.rasterOut = rasterOut.flat().data();
+ p.tri = tri.flat