<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">
<html>
<head>
<meta name="viewport" content="width=800">
<meta name="generator" content="HTML Tidy for Linux/x86 (vers 11 February 2007), see www.w3.org">
<style type="text/css">
/* Color scheme stolen from Sergey Karayev */
a {
color: #1772d0;
text-decoration:none;
}
a:focus, a:hover {
color: #f09228;
text-decoration:none;
}
body,td,th,tr,p,a {
font-family: 'Lato', Verdana, Helvetica, sans-serif;
font-size: 14px
}
strong {
font-family: 'Lato', Verdana, Helvetica, sans-serif;
font-size: 14px;
}
heading {
font-family: 'Lato', Verdana, Helvetica, sans-serif;
font-size: 22px;
}
papertitle {
font-family: 'Lato', Verdana, Helvetica, sans-serif;
font-size: 14px;
font-weight: 700
}
name {
font-family: 'Lato', Verdana, Helvetica, sans-serif;
font-size: 32px;
}
.one
{
width: 160px;
height: 160px;
position: relative;
}
.two
{
width: 160px;
height: 160px;
position: absolute;
transition: opacity .2s ease-in-out;
-moz-transition: opacity .2s ease-in-out;
-webkit-transition: opacity .2s ease-in-out;
}
.fade {
transition: opacity .2s ease-in-out;
-moz-transition: opacity .2s ease-in-out;
-webkit-transition: opacity .2s ease-in-out;
}
span.highlight {
background-color: #ffffd0;
}
</style>
<title>Buğra Tekin</title>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8">
<link href='http://fonts.googleapis.com/css?family=Lato:400,700,400italic,700italic' rel='stylesheet' type='text/css'>
</head>
<body>
<table width="800" border="0" align="center" cellspacing="0" cellpadding="0">
<tr>
<td>
<table width="100%" align="center" border="0" cellspacing="0" cellpadding="20">
<tr>
<td width="80%" valign="middle">
<p align="center">
<name>Buğra Tekin</name>
</p>
<p align="justify">I have been a Research Scientist at Meta Reality Labs since 2022. Before that, I spent almost four years at the <a href="https://www.microsoft.com/en-us/research/lab/mixed-reality-ai-zurich/">Microsoft Mixed Reality & AI Lab</a> in Zürich. I received my PhD from the <a href="http://cvlab.epfl.ch">Computer Vision Laboratory</a> of <a href="http://epfl.ch">EPFL</a> under the supervision of <a href="http://people.epfl.ch/cgi-bin/people?id=112366&op=bio&lang=en&cvlang=en">Prof. Pascal Fua</a> and <a href="http://www.labri.fr/perso/vlepetit/">Prof. Vincent Lepetit</a>. Prior to my PhD, I obtained my MS degree in Electrical Engineering from <a href="http://epfl.ch">EPFL</a> in 2013 and my BS degree in Electrical & Electronics Engineering from <a href="http://www.boun.edu.tr/en-US/Index">Bogazici University</a> in 2011 with high honors. I have also spent time at <a href="https://www.microsoft.com/en-us/research/">Microsoft Research</a> as a research intern and at <a href="https://ethz.ch/en.html">ETH Zürich</a> as a visiting researcher. I received the <a href="https://www.qualcomm.com/invention/research/university-relations/innovation-fellowship">Qualcomm Innovation Fellowship Europe</a> in 2017.
</p>
<p align=center>
<a href="mailto:tekinbu@gmail.com">Email</a>  / 
<a href="https://scholar.google.ch/citations?user=3fa02HAAAAAJ&hl=en">Google Scholar</a>  / 
<a href="https://www.linkedin.com/in/bugratekin/"> LinkedIn </a>
</p>
</td>
<td width="33%" align="right">
<img src="photo_tekin.jpeg" alt="Buğra Tekin">
</td>
</tr>
</table>
<table width="100%" align="justify" border="0" cellspacing="0" cellpadding="20">
<tr>
<td width="100%" valign="middle">
<heading>Research</heading>
<p align=justify>
I am interested in computer vision, machine learning, deep learning, image processing, and augmented reality. Much of my research focuses on semantically understanding humans and objects in 3D from camera images. In particular, I work on video understanding, multimodal language models, 2D/3D human pose estimation, hand pose estimation, action recognition, human-object interactions, and 6D object pose estimation. In the past, I have also worked on biomedical imaging.
</p>
</td>
</tr>
</table>
<table width="100%" align="justify" border="0" cellspacing="0" cellpadding="20">
<tr>
<td width="100%" valign="middle">
<heading>Publications</heading>
</td>
</tr>
</table>
<table width="100%" align="justify" border="0" cellspacing="0" cellpadding="20">
<tr onmouseout="ffcc_stop()" onmouseover="ffcc_start()" >
<td width="25%">
<img src='palm.png' width="160" >
</td>
<td valign="top" width="75%">
<p><a href="https://arxiv.org/pdf/2511.05403">
<papertitle>PALM: A Dataset and Baseline for Learning Multi-subject Hand Prior</papertitle></a><br>
Zicong Fan, Edoardo Remelli, David Dimond, Fadime Sener, Liuhao Ge, <strong>Bugra Tekin</strong>, Cem Keskin, Shreyas Hampali <br>
<em>International Conference on 3D Vision (3DV)</em>, 2026 <br>
<p></p>
<p align=justify> We present PALM, a large-scale hand dataset, along with PALM-Net, a physically based model enabling realistic single-image hand avatar personalization.
</p>
<p></p>
</td>
</tr>
<tr onmouseout="ffcc_stop()" onmouseover="ffcc_start()" >
<td width="25%">
<img src='providllm.png' width="160" >
</td>
<td valign="top" width="75%">
<p><a href="https://arxiv.org/pdf/2504.13915">
<papertitle>Memory-efficient Streaming VideoLLMs for Real-Time Procedural Video Understanding</papertitle></a><br>
Dibyadip Chatterjee, Edoardo Remelli, Yale Song, <strong>Bugra Tekin</strong>, Abhay Mittal, Bharat Bhatnagar, Necati Cihan Camgoz, Shreyas Hampali, Eric Sauser, Shugao Ma, Angela Yao, Fadime Sener <br>
<em>International Conference on Computer Vision (ICCV)</em>, 2025 <br>
<p></p>
<p align=justify> A streaming video large language model for real-time procedural video tasks with a low memory footprint.
</p>
<p></p>
</td>
</tr>
<tr onmouseout="ffcc_stop()" onmouseover="ffcc_start()" >
<td width="25%">
<img src='hoigpt.png' width="160" >
</td>
<td valign="top" width="75%">
<p><a href="https://arxiv.org/pdf/2503.19157">
<papertitle>HOIGPT: Learning Long Sequence Hand-Object Interaction with Language Models</papertitle></a><br>
Mingzhen Huang, Fu-Jen Chu, <strong>Bugra Tekin</strong>, Kevin Liang, Haoyu Ma, Weiyao Wang, Xingyu Chen, Pierre Gleize, Hongfei Xue, Siwei Lyu, Kris Kitani, Matt Feiszli, Hao Tang <br>
<em>Computer Vision and Pattern Recognition (CVPR)</em>, 2025 <br>
<p></p>
<p align=justify> The first token-based generative model to unify both the understanding and generation of 3D hand-object interactions (HOI).
</p>
<p></p>
</td>
</tr>
<tr onmouseout="ffcc_stop()" onmouseover="ffcc_start()" >
<td width="25%">
<img src='humocon.png' width="160" >
</td>
<td valign="top" width="75%">
<p><a href="https://openaccess.thecvf.com/content/CVPR2025/html/Fang_HuMoCon_Concept_Discovery_for_Human_Motion_Understanding_CVPR_2025_paper.html">
<papertitle>HuMoCon: Concept Discovery for Human Motion Understanding</papertitle></a><br>
Qihang Fang, Chengcheng Tang, <strong>Bugra Tekin</strong>, Shugao Ma, Yanchao Yang <br>
<em>Computer Vision and Pattern Recognition (CVPR)</em>, 2025 <br>
<p></p>
<p align=justify> For human action understanding with video-motion input, we propose HuMoCon, a framework designed to tackle feature misalignment and high-frequency information loss. HuMoCon enables effective motion concept discovery and enhances accuracy in Question Answering tasks.
</p>
<p></p>
</td>
</tr>
<tr onmouseout="ffcc_stop()" onmouseover="ffcc_start()" >
<td width="25%">
<img src='gotrack.png' width="160" >
</td>
<td valign="top" width="75%">
<p><a href="https://www.arxiv.org/pdf/2506.07155">
<papertitle>GoTrack: Generic 6DoF Object Pose Refinement and Tracking</papertitle></a><br>
Van Nguyen Nguyen, Christian Forster, Sindi Shkodrani, Vincent Lepetit, <strong>Bugra Tekin</strong>, Cem Keskin, Tomas Hodan<br>
<em>Computer Vision and Pattern Recognition Workshops (CVPRW)</em>, 2025 <br>
<p></p>
<p align=justify> We introduce GoTrack, an efficient and accurate CAD-based method for 6DoF object pose refinement and tracking, which can handle diverse objects without any object-specific training.
</p>
<p></p>
</td>
</tr>
<tr onmouseout="ffcc_stop()" onmouseover="ffcc_start()" >
<td width="25%">
<img src='diffh2o_teaser.gif' width="160" >
</td>
<td valign="top" width="75%">
<p><a href="https://arxiv.org/pdf/2403.17827.pdf">
<papertitle>DiffH2O: Diffusion-Based Synthesis of Hand-Object Interactions from Textual Descriptions</papertitle></a><br>
Sammy Christen, Shreyas Hampali, Fadime Sener, Edoardo Remelli, Tomas Hodan, Eric Sauser, Shugao Ma, <strong>Bugra Tekin</strong><br>
<em>SIGGRAPH Asia</em>, 2024 <br>
<p></p>
<p align=justify> We introduce DiffH2O, a diffusion-based framework to synthesize dexterous hand-object interactions. DiffH2O generates realistic hand-object motion from natural language, generalizes to unseen objects at test time, and enables fine-grained control over the motion with detailed textual descriptions.
</p>
<p></p>
</td>
</tr>
<tr onmouseout="ffcc_stop()" onmouseover="ffcc_start()" >
<td width="25%">
<img src='cigtime.png' width="160" height="160" >
</td>
<td valign="top" width="75%">
<p><a href="https://btekin.github.io">
<papertitle>CigTime: Corrective Instruction Generation Through Inverse Motion Editing</papertitle></a><br>
Qihang Fang, Chengcheng Tang, <strong>Bugra Tekin</strong>, Yanchao Yang<br>
<em> Neural Information Processing Systems (NeurIPS)</em>, 2024 <br>
<p></p>
<p align=justify> We introduce a novel task and system for automated coaching and feedback on human motion, aimed at generating corrective instructions and guidance for body posture and movement during specific tasks.
</p>
<p></p>
</td>
</tr>
<tr onmouseout="ffcc_stop()" onmouseover="ffcc_start()" >
<td width="25%">
<img src='foundpose_thumb.png' width="160" >
</td>
<td valign="top" width="75%">
<p><a href="https://arxiv.org/pdf/2311.18809.pdf">
<papertitle>FoundPose: Unseen Object Pose Estimation with Foundation Features</papertitle></a><br>
Evin Pınar Örnek, Yann Labbé, <strong>Bugra Tekin</strong>, Lingni Ma, Cem Keskin, Christian Forster, Tomas Hodan<br>
<em>European Conference on Computer Vision (ECCV)</em>, 2024 <br>
<p></p>
<p align=justify> A method for 6D pose estimation of unseen rigid objects from a single RGB image without any object-specific training.
</p>
<p></p>
</td>
</tr>
<tr onmouseout="ffcc_stop()" onmouseover="ffcc_start()" >
<td width="25%">
<img src='xmic.png' width="160" >
</td>
<td valign="top" width="75%">
<p><a href="https://arxiv.org/pdf/2403.19811v1">
<papertitle>X-MIC: Cross-Modal Instance Conditioning for Egocentric Action Generalization</papertitle></a><br>
Anna Kukleva, Fadime Sener, Edoardo Remelli, <strong>Bugra Tekin</strong>, Eric Sauser, Bernt Schiele, Shugao Ma<br>
<em>Computer Vision and Pattern Recognition (CVPR)</em>, 2024 <br>
<p></p>
<p align=justify> A simple yet effective cross-modal adaptation framework for VLMs.
</p>
<p></p>
</td>
</tr>
<tr onmouseout="ffcc_stop()" onmouseover="ffcc_start()" >
<td width="25%">
<img src='holoassist.png' width="160" >
</td>
<td valign="top" width="75%">
<p><a href="https://arxiv.org/pdf/2309.17024">
<papertitle>HoloAssist: An Egocentric Human Interaction Dataset for Interactive AI Assistants in the Real World</papertitle></a><br>
Xin Wang*, Taein Kwon*, Mahdi Rad, Bowen Pan, Ishani Chakraborty, Sean Andrist, Dan Bohus, Ashley Fanello, <strong>Bugra Tekin</strong>, Felipe Vieira Frujeri, Neel Joshi, Marc Pollefeys<br>
<em>International Conference on Computer Vision (ICCV)</em>, 2023 <br>
<p></p>
<p align=justify> HoloAssist is a large-scale egocentric human interaction dataset, where two people collaboratively complete physical manipulation tasks. By augmenting the data with action and conversational annotations and observing the rich behaviors of various participants, we present key insights into how human assistants correct mistakes, intervene in the task completion procedure, and ground their instructions to the environment.
</p>
<p></p>
</td>
</tr>
<tr onmouseout="ffcc_stop()" onmouseover="ffcc_start()" >
<td width="25%">
<img src="casa.png" />
</td>
<td valign="top" width="75%">
<p><a href="https://arxiv.org/pdf/2204.12223.pdf">
<papertitle>Context-Aware Sequence Alignment using 4D Skeletal Augmentation</papertitle></a><br>
Taein Kwon, <strong>Bugra Tekin</strong>, Siyu Tang, Marc Pollefeys<br>
<em>Computer Vision and Pattern Recognition (CVPR)</em>, 2022 <strong>(Oral)</strong> <br>
<p></p>
<p align=justify> We propose a skeletal self-supervised learning approach that uses alignment as a pretext task. Our approach to alignment relies on a context-aware attention model that incorporates spatial and temporal context within and across sequences and a contrastive learning formulation that relies on 4D skeletal augmentations. Pose data provides a valuable cue for alignment and downstream tasks, such as phase classification and phase progression, as it is robust to different camera angles and changes in the background, while being efficient for real-time processing.
</p>
<p></p>
</td>
</tr>
<tr onmouseout="ffcc_stop()" onmouseover="ffcc_start()" >
<td width="25%">
<img src="vava.gif"
width="160"
height="118" />
</td>
<td valign="top" width="75%">
<p><a href="https://arxiv.org/pdf/2111.09301.pdf">
<papertitle>Learning to Align Sequential Actions in the Wild</papertitle></a><br>
Weizhe Liu, <strong>Bugra Tekin</strong>, Huseyin Coskun, Vibhav Vineet, Pascal Fua, Marc Pollefeys<br>
<em>Computer Vision and Pattern Recognition (CVPR)</em>, 2022 <br>
<p></p>
<p align=justify> We propose an approach to align sequential actions in the wild that involve diverse temporal variations. To this end, we present a new method to enforce temporal priors on the optimal transport matrix, which leverages temporal consistency, while allowing for variations in the order of actions. Our model accounts for both monotonic and non-monotonic sequences and handles background frames that should not be aligned. We demonstrate that our approach consistently outperforms the state-of-the-art in self-supervised sequential action representation learning.
</p>
<p></p>
</td>
</tr>
<tr onmouseout="ffcc_stop()" onmouseover="ffcc_start()" >
<td width="25%">
<img src='h2o.png'>
</td>
<td valign="top" width="75%">
<p><a href="https://arxiv.org/pdf/2104.11181.pdf">
<papertitle>H2O: Two Hands Manipulating Objects for First Person Interaction Recognition</papertitle></a><br>
Taein Kwon, <strong>Bugra Tekin</strong>, Jan Stuehmer, Federica Bogo, Marc Pollefeys<br>
<em>International Conference on Computer Vision (ICCV)</em>, 2021<br>
<a href ="https://www.taeinkwon.com/projects/h2o">project</a>
<p></p>
<p align=justify> In this paper, we propose a method to collect a dataset of two hands manipulating objects for first person interaction recognition. We provide a rich set of annotations including action labels, object classes, 3D left & right hand poses, 6D object poses, camera poses and scene point clouds. We further propose the first method to jointly recognize the 3D poses of two hands manipulating objects and a novel topology-aware graph convolutional network for recognizing hand-object interactions.
</p>
<p></p>
</td>
</tr>
<tr onmouseout="ffcc_stop()" onmouseover="ffcc_start()" >
<td width="25%">
<img src='lowshotar.png'>
</td>
<td valign="top" width="75%">
<p><a href="https://arxiv.org/pdf/1907.09382.pdf">
<papertitle>Domain-Specific Priors and Meta Learning for Low-shot First-Person Action Recognition</papertitle></a><br>
Huseyin Coskun, Zeeshan Zia, <strong>Bugra Tekin</strong>, Federica Bogo, Nassir Navab, Federico Tombari, Harpreet Sawhney<br>
<em>Pattern Analysis and Machine Intelligence (PAMI)</em>, 2021<br>
<p></p>
<p align=justify> We develop an effective method for low-shot transfer learning for first-person action classification. We leverage independently trained local visual cues to learn representations that can be transferred from a source domain providing primitive action labels to a target domain with only a handful of examples.
</p>
<p></p>
</td>
</tr>
<tr onmouseout="ffcc_stop()" onmouseover="ffcc_start()" >
<td width="25%">
<img src='grounding_instructional_videos.png'>
</td>
<td valign="top" width="75%">
<p><a href="https://arxiv.org/pdf/2109.04409.pdf">
<papertitle>Reconstructing and grounding narrated instructional videos in 3D</papertitle></a><br>
Dimitri Zhukov, Ignacio Rocco, Ivan Laptev, Josef Sivic, Johannes L. Schoenberger, <strong>Bugra Tekin</strong>, Marc Pollefeys<br>
<em>arXiv preprint arXiv:2109.04409</em>, 2021<br>
<p></p>
<p align=justify> We present a method for 3D reconstruction of instructional videos and for localizing the associated narrations in 3D. Our method is robust to differences in the appearance of objects depicted in the videos and is computationally efficient.
</p>
<p></p>
</td>
</tr>
<tr onmouseout="ffcc_stop()" onmouseover="ffcc_start()" >
<td width="25%">
<img src='cvpr20yana.png'>
</td>
<td valign="top" width="75%">
<p><a href="https://arxiv.org/pdf/2004.13449.pdf">
<papertitle>Leveraging Photometric Consistency over Time for Sparsely Supervised Hand-Object Reconstruction</papertitle></a><br>
Yana Hasson, <strong>Bugra Tekin</strong>, Federica Bogo, Ivan Laptev, Marc Pollefeys, Cordelia Schmid<br>
<em>Computer Vision and Pattern Recognition (CVPR)</em>, 2020 <br>
<a href ="https://github.com/hassony2/handobjectconsist">code</a>
<p></p>
<p align=justify>In this paper, we propose a new method for dense 3D reconstruction of hands and objects from monocular color images. We further present a self-supervised learning approach leveraging photo-consistency between sparsely supervised frames.
</p>
<p></p>
</td>
</tr>
<tr onmouseout="ffcc_stop()" onmouseover="ffcc_start()" >
<td width="25%">
<img src='hl2rm.png'>
</td>
<td valign="top" width="75%">
<p><a href="https://arxiv.org/pdf/2008.11239.pdf">
<papertitle>HoloLens 2 Research Mode as a Tool for Computer Vision Research</papertitle></a><br>
Dorin Ungureanu, Federica Bogo, Silvano Galliani, Pooja Sama, Xin Duan, Casey Meekhof, Jan Stuehmer, Thomas Cashman, <strong>Bugra Tekin</strong>, Johannes L. Schoenberger, Pawel Olszta, Marc Pollefeys<br>
<em>Tech Report</em>, 2020 <br>
<a href ="https://github.com/microsoft/HoloLens2ForCV">code</a>
<p></p>
<p align=justify>We present HoloLens 2 Research Mode, an API and a set of tools enabling access to the raw sensor streams. We provide an overview of the API and explain how it can be used to build mixed reality applications based on processing sensor data. We also show how to combine the Research Mode sensor data with the built-in eye and hand tracking capabilities provided by HoloLens 2.
</p>
<p></p>
</td>
</tr>
<tr onmouseout="ffcc_stop()" onmouseover="ffcc_start()" >
<td width="25%">
<img src='advgp.png'>
</td>
<td valign="top" width="75%">
<p><a href="https://openaccess.thecvf.com/content/ACCV2020/papers/Zhou_Reconstructing_Human_Body_Mesh_from_Point_Clouds_by_Adversarial_GP_ACCV_2020_paper.pdf">
<papertitle>Reconstructing Human Body Mesh from Point Clouds by Adversarial GP Network</papertitle></a><br>
Boyao Zhou, Jean-Sebastien Franco, Federica Bogo, <strong>Bugra Tekin</strong>, Edmond Boyer<br>
<em>Asian Conference on Computer Vision (ACCV)</em>, 2020 <br>
<p></p>
<p align=justify>We study the problem of reconstructing a template-aligned human body mesh from unstructured point cloud data. We propose a dedicated human template matching process built on a point-based deep autoencoder architecture, in which the consistency of surface points is enforced and parameterized with a specialized Gaussian Process layer, and global consistency and generalization are enforced with adversarial training.
</p>
<p></p>
</td>
</tr>
<tr onmouseout="ffcc_stop()" onmouseover="ffcc_start()" >
<td width="25%">
<img src='handplusobject.png'>
</td>
<td valign="top" width="75%">
<p><a href="https://arxiv.org/pdf/1904.05349.pdf">
<papertitle>H+O: Unified Egocentric Recognition of 3D Hand-Object Poses and Interactions</papertitle></a><br>
<strong>Bugra Tekin</strong>, Federica Bogo, Marc Pollefeys<br>
<em>Computer Vision and Pattern Recognition (CVPR)</em>, 2019 <strong>(Oral)</strong> <br>
<p></p>
<p align=justify>In this work, we propose, for the first time, a unified method to jointly recognize 3D hand and object poses, and their interactions from egocentric monocular color images. Our method jointly estimates the hand and object poses in 3D, models their interactions and recognizes the object and activity classes with a single feed-forward pass through a neural network.
</p>
<p></p>
</td>
</tr>
<tr onmouseout="ffcc_stop()" onmouseover="ffcc_start()" >
<td width="25%">
<img src='singleshotpose.png'>
</td>
<td valign="top" width="75%">
<p><a href="http://openaccess.thecvf.com/content_cvpr_2018/papers/Tekin_Real-Time_Seamless_Single_CVPR_2018_paper.pdf">
<papertitle>Real Time Seamless Single Shot 6D Object Pose Prediction</papertitle></a><br>
<strong>Bugra Tekin</strong>, Sudipta N. Sinha, Pascal Fua<br>
<em>Computer Vision and Pattern Recognition (CVPR)</em>, 2018 <br>
<a href ="http://openaccess.thecvf.com/content_cvpr_2018/Supplemental/3117-supp.pdf">supplementary</a>
/
<a href ="https://github.com/Microsoft/singleshotpose">code</a>
<p></p>
<p align=justify>We introduce a new deep learning architecture that naturally extends the single-shot 2D object detection paradigm to 6D object pose estimation. It demonstrates state-of-the-art accuracy with real-time performance and is at least 5 times faster than the existing methods (50 to 94 fps depending on the input resolution). </p>
<p></p>
</td>
</tr>
<tr onmouseout="ffcc_stop()" onmouseover="ffcc_start()" >
<td width="25%">
<img src='latentpose.png'>
</td>
<td valign="top" width="75%">
<p><a href="https://infoscience.epfl.ch/record/252823/files/main_paper.pdf">
<papertitle>Learning Latent Representations of 3D Human Pose with Deep Neural Networks</papertitle></a><br>
Isinsu Katircioglu*, <strong>Bugra Tekin*</strong>, Mathieu Salzmann, Vincent Lepetit, Pascal Fua<br>
<em>International Journal of Computer Vision (IJCV)</em>, 2018 <br>
<p></p>
<p align=justify>We propose an efficient Long Short-Term Memory (LSTM) network for enforcing consistency of 3D human pose predictions across temporal windows.
</p>
</td>
</tr>
<tr onmouseout="ffcc_stop()" onmouseover="ffcc_start()" >
<td width="25%">
<img src='fusion.png'>
</td>
<td valign="top" width="75%">
<p><a href="https://arxiv.org/pdf/1611.05708v1.pdf">
<papertitle>Learning to Fuse 2D and 3D Image Cues for Monocular Body Pose Estimation</papertitle></a><br>
<strong>Bugra Tekin</strong>, Pablo Marquez-Neila, Mathieu Salzmann, Pascal Fua<br>
<em>International Conference on Computer Vision (ICCV)</em>, 2017 <br>
<a href ="https://infoscience.epfl.ch/record/230311/files/TekinEtAlICCV17Supp.zip">supplementary</a>
/
<a href ="https://drive.switch.ch/index.php/s/jvPwlyJUb4lxR0M">code</a>
/
<a href ="https://cvlab.epfl.ch/research/surv/human-pose-estimation">project</a>
<p></p>
<p align=justify>We introduce an approach to learn where and how to fuse the streams of a two-stream convolutional neural network operating on different input modalities for 3D human pose estimation.</p>
</td>
</tr>
<tr onmouseout="ffcc_stop()" onmouseover="ffcc_start()" >
<td width="25%">
<img src='hardfusion.png'>
</td>
<td valign="top" width="75%">
<p><a href="https://arxiv.org/pdf/1611.05708v1.pdf">
<papertitle>Fusing 2D Uncertainty and 3D Cues for Monocular Body Pose Estimation</papertitle></a><br>
<strong>Bugra Tekin</strong>, Pablo Marquez-Neila, Mathieu Salzmann, Pascal Fua<br>
<em>arXiv Preprint, arXiv:1611.05708</em>, 2016 <br>
<a href ="https://cvlab.epfl.ch/research/surv/human-pose-estimation">project</a>
<p></p>
<p align=justify>We propose to jointly model 2D uncertainty and leverage 3D image cues in a regression framework for reliable monocular 3D human pose estimation.
</p>
</td>
</tr>
<tr onmouseout="ffcc_stop()" onmouseover="ffcc_start()" >
<td width="25%">
<img src='autoenc.png'>
</td>
<td valign="top" width="75%">
<p><a href="https://infoscience.epfl.ch/record/220616/files/tekin_bmvc16.pdf">
<papertitle>Structured Prediction of 3D Human Pose with Deep Neural Networks</papertitle></a><br>
<strong>Bugra Tekin*</strong>, Isinsu Katircioglu*, Mathieu Salzmann, Vincent Lepetit, Pascal Fua<br>
<em>British Machine Vision Conference (BMVC)</em>, 2016 <strong>(Oral)</strong> <br>
<p></p>
<p align=justify>We introduce a Deep Learning regression architecture for structured prediction of 3D human pose from monocular images that relies on an overcomplete auto-encoder to learn a high-dimensional latent pose representation and account for joint dependencies.
</p>
<p></p>
</td>
</tr>
<tr onmouseout="ffcc_stop()" onmouseover="ffcc_start()" >
<td width="25%">
<img src='rstv.png'>
</td>
<td valign="top" width="75%">
<p><a href="https://www.cv-foundation.org/openaccess/content_cvpr_2016/papers/Tekin_Direct_Prediction_of_CVPR_2016_paper.pdf">
<papertitle>Direct Prediction of 3D Body Poses from Motion Compensated Sequences</papertitle></a><br>
<strong>Bugra Tekin</strong>, Artem Rozantsev, Vincent Lepetit, Pascal Fua<br>
<em>Computer Vision and Pattern Recognition (CVPR)</em>, 2016<br>
<a href ="https://cvlab.epfl.ch/research/surv/human-pose-estimation">project</a>
<p></p>
<p align=justify>We propose to predict the 3D human pose from a spatiotemporal volume of bounding boxes. We further propose a CNN-based motion compensation method that increases the stability and reliability of our 3D pose estimates.
</p>
<p></p>
</td>
</tr>
<tr onmouseout="ffcc_stop()" onmouseover="ffcc_start()" >
<td width="25%">
<img src='stfeatures.png'>
</td>
<td valign="top" width="75%">
<p><a href="https://arxiv.org/pdf/1504.08200v1.pdf">
<papertitle>Predicting People's 3D Poses from Short Sequences</papertitle></a><br>
<strong>Bugra Tekin</strong>, Xiaolu Sun, Xinchao Wang, Vincent Lepetit, Pascal Fua<br>
<em>arXiv Preprint, arXiv:1504.08200</em>, 2015<br>
<p></p>
<p align=justify>We propose an efficient approach to exploiting motion information from consecutive frames of a video sequence to recover the 3D pose of people. Instead of computing candidate poses in individual frames and then linking them, as is often done, we regress directly from a spatio-temporal block of frames to a 3D pose in the central one.
</p>
</td>
</tr>
<tr onmouseout="ffcc_stop()" onmouseover="ffcc_start()" >
<td width="25%">
<img src='separable.png'>
</td>
<td valign="top" width="75%">
<p><a href="https://infoscience.epfl.ch/record/200142/files/separable_filters_learning_1.pdf">
<papertitle>Learning Separable Filters</papertitle></a><br>
Amos Sironi*, <strong>Bugra Tekin*</strong>, Roberto Rigamonti, Vincent Lepetit, Pascal Fua<br>
<em>Pattern Analysis and Machine Intelligence (PAMI)</em>, 2014<br>
<a href ="https://infoscience.epfl.ch/record/200142/files/appendix.pdf">supplementary</a>
/
<a href ="https://bitbucket.org/bugratekin/learning_2d_separable_filters_sep_td/src">code 2D</a>
/
<a href ="https://bitbucket.org/bugratekin/learning_2d_separable_filters_sep_td/src">code 3D</a>
<p></p>
<p align=justify> We introduce an efficient approach to approximate a set of nonseparable convolutional filters by linear combinations of a smaller number of separable ones. We demonstrate that this greatly reduces the computational complexity at no cost in terms of performance for image recognition tasks with convolutional filters and CNNs.
</p>
</td>
</tr>
<tr onmouseout="ffcc_stop()" onmouseover="ffcc_start()" >
<td width="25%">
<img src='steerable.png'>
</td>
<td valign="top" width="75%">
<p><a href="http://bigwww.epfl.ch/publications/tekin1301.pdf">
<papertitle>Benefits of Consistency in Image Denoising with Steerable Wavelets</papertitle></a><br>
<strong>Bugra Tekin</strong>, Ulugbek Kamilov, Emrah Bostan, Michael Unser<br>
<em>International Conference on Acoustics, Speech and Signal Processing (ICASSP)</em>, 2013 <strong>(Oral)</strong> <br>
<p></p>
<p align=justify> We propose a technique for improving the performance of L1-based image denoising in the steerable wavelet domain. Our technique, which we call consistency, refers to the fact that the solution obtained by the algorithm is constrained to the space spanned by the basis functions of the transform, which results in a certain norm equivalence between image-domain and wavelet-domain estimations.
</p>
</td>
</tr>
</table>
<table width="100%" align="center" border="0" cellspacing="0" cellpadding="20">
<tr>
<td width="100%" valign="middle">
<p>
(*: indicates equal contribution)
</p>
</td>
</tr>
</table>
<table width="100%" align="justify" border="0" cellspacing="0" cellpadding="20">
<tr>
<td width="100%" valign="middle">
<heading>Theses</heading>
</td>
</tr>
</table>
<table width="100%" align="justify" border="0" cellspacing="0" cellpadding="20">
<tr onmouseout="ffcc_stop()" onmouseover="ffcc_start()" >
<td width="25%" valign="top">
<img src='phdthesiscover.png'>
</td>
<td valign="top" width="75%">
<p><a href="https://infoscience.epfl.ch/record/256865/files/EPFL_TH8753.pdf">
<papertitle>Learning Robust Features and Latent Representations for Single View 3D Pose Estimation of Humans and Objects</papertitle></a><br>
Bugra Tekin<br>
<em>Ph.D. Thesis </em>, September 2018 <br>
<p></p>
</td>
</tr>
<tr onmouseout="ffcc_stop()" onmouseover="ffcc_start()" >
<td width="25%" valign="top">
<img src='masterthesiscover.png'>
</td>
<td valign="top" width="75%">
<p><a href="https://infoscience.epfl.ch/record/229999/files/ms_thesis_tekin.pdf">
<papertitle>Learning Separable Filters with Shared Parts</papertitle></a><br>
Bugra Tekin<br>
<em>M.Sc. Thesis </em>, June 2013 <br>
<p></p>
</td>
</tr>
</table>
<table width="100%" align="center" border="0" cellspacing="0" cellpadding="20">
<tr>
<td>
<heading>Patents</heading>
</td>
</tr>
</table>
<table width="100%" align="center" border="0" cellpadding="20">
<tr>
<td width="25%"><img src="patent.jpeg" alt="nnteaching" width="160" height="160"></td>
<td width="75%" valign="middle">
<p>
<a href="https://patents.google.com/patent/US20230244316A1/en">Gesture recognition based on likelihood of interaction</a> <br>
US Patent App. 17/649,659
</p>
<p>
<a href="https://patents.google.com/patent/US20230019745A1/en">Multi-modal sensor based process tracking and guidance</a> <br>
US Patent App. 17/377,152
</p>
<p>
<a href="https://patents.google.com/patent/US20220230079A1/en">Action recognition</a> <br>
US Patent App. 17/155,013
</p>
<p>
<a href="https://patents.google.com/patent/US11106949B2/en">Action classification based on manipulated object movement</a> <br>
US Patent 11,106,949
</p>
<p>
<a href="https://patents.google.com/patent/US11004230B2/en">Predicting three-dimensional articulated and target object pose</a> <br>
US Patent 11,004,230
</p>
<p>
<a href="https://patents.google.com/patent/US20200311396A1/en">Spatially consistent representation of hand motion</a> <br>
US Patent App. 16/363,964
</p>
<p>
<a href="https://patents.google.com/patent/US20170316578A1/en">Method, System and Device for Direct Prediction of 3D Body Poses from Motion Compensated Sequence</a> <br>
US Patent App. US 2017-0316578 A1
</p>
</td>
</tr>
</table>
<table width="100%" align="center" border="0" cellspacing="0" cellpadding="20">
<tr>
<td>
<heading>Teaching</heading>
</td>
</tr>
</table>
<table width="100%" align="center" border="0" cellpadding="20">
<tr>
<td width="25%"><img src="nnteaching.png" alt="nnteaching" width="160" height="160"></td>
<td width="75%" valign="middle">
<p>
Deep Learning, TA, 2018
</p>
<p>
Computer Vision, TA, 2016, 2017
</p>
<p>
Numerical Methods for Visual Computing, TA, 2016
</p>
<p>
Programmation (C/C++) / (Java), TA, 2013, 2015
</p>
<p>
Principles of Digital Communications, TA, 2013
</p>
<p>
Circuits and Systems I/II, TA, 2011, 2012, 2013
</p>
</td>
</tr>
</table>
<table width="100%" align="center" border="0" cellspacing="0" cellpadding="20">
<tr>
<td>
<br>
<p align="right">
<font size="2">
<a href="https://forvo.com/word/bu%C4%9Fra/#tr"><strong>pronunciation of my name, Buğra</strong></a> / website template from <a href="https://jonbarron.info/"><strong>Jon Barron</strong></a>
</font>
</p>
</td>
</tr>
</table>
<script type="text/javascript">
var gaJsHost = (("https:" == document.location.protocol) ? "https://ssl." : "http://www.");
document.write(unescape("%3Cscript src='" + gaJsHost + "google-analytics.com/ga.js' type='text/javascript'%3E%3C/script%3E"));
</script> <script type="text/javascript">
try {
var pageTracker = _gat._getTracker("UA-7580334-1");
pageTracker._trackPageview();
} catch(err) {}
</script>
</td>
</tr>
</table>
</body>
</html>