-
Notifications
You must be signed in to change notification settings - Fork 75
Expand file tree
/
Copy pathriflex_utils.py
More file actions
89 lines (78 loc) · 3.68 KB
/
riflex_utils.py
File metadata and controls
89 lines (78 loc) · 3.68 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
import torch
import numpy as np
from typing import Union,Optional
def get_1d_rotary_pos_embed_riflex(
    dim: int,
    pos: Union[np.ndarray, int],
    theta: float = 10000.0,
    use_real=False,
    k: Optional[int] = None,
    L_test: Optional[int] = None,
):
    """
    RIFLEx: Precompute the 1D rotary position embedding (RoPE) frequency tensor.

    The standard RoPE frequencies ``1 / theta ** (2i / dim)`` are computed for
    ``i = 0 .. dim/2 - 1``. When `k` is given, the k-th (1-based) frequency is
    replaced by ``0.9 * 2 * pi / L_test`` before the frequencies are multiplied
    with the positions.

    Args:
        dim (`int`): Dimension of the frequency tensor. Must be even.
        pos (`np.ndarray` or `int`): Position indices for the frequency tensor. [S] or scalar.
            An `int` is expanded to `torch.arange(pos)`; an `np.ndarray` is converted with
            `torch.from_numpy`. Any other object is used as-is and must behave like a
            1-D `torch.Tensor`.
        theta (`float`, *optional*, defaults to 10000.0):
            Scaling factor (base) for frequency computation.
        use_real (`bool`, *optional*):
            If True, return real part and imaginary part separately. Otherwise, return complex numbers.
        k (`int`, *optional*, defaults to None): the 1-based index for the intrinsic frequency in RoPE.
            When None, no RIFLEx modification is applied.
        L_test (`int`, *optional*, defaults to None): the number of frames for inference.
            Required whenever `k` is given.

    Returns:
        If `use_real` is False: `torch.Tensor` of complex exponentials with shape [S, D/2]
        (complex64 when the angles are float32).
        If `use_real` is True: a tuple `(freqs_cos, freqs_sin)` of float tensors, each of
        shape [S, D], where every frequency is repeated twice along the last dimension.

    Raises:
        AssertionError: if `dim` is odd.
        ValueError: if `k` is given without `L_test`, or if `k` is smaller than 1.
        IndexError: if `k` is larger than `dim // 2`.
    """
    assert dim % 2 == 0, "dim must be even"

    if isinstance(pos, int):
        pos = torch.arange(pos)
    elif isinstance(pos, np.ndarray):
        pos = torch.from_numpy(pos)  # type: ignore  # [S]

    exponents = torch.arange(0, dim, 2, device=pos.device)[: (dim // 2)].float() / dim
    freqs = 1.0 / (theta ** exponents)  # [D/2]

    # === Riflex modification start ===
    # Reduce the intrinsic frequency to stay within a single period after extrapolation (see Eq. (8)).
    # Empirical observations show that a few videos may exhibit repetition in the tail frames.
    # To be conservative, we multiply by 0.9 to keep the extrapolated length below 90% of a single period.
    if k is not None:
        if L_test is None:
            # Without this check the division below fails with an opaque TypeError.
            raise ValueError("L_test must be provided when k is given")
        if k < 1:
            # k is 1-based; k <= 0 would wrap around via negative indexing and
            # silently overwrite the wrong frequency component.
            raise ValueError(f"k must be a 1-based index (>= 1), got {k}")
        freqs[k - 1] = 0.9 * 2 * torch.pi / L_test
    # === Riflex modification end ===

    freqs = torch.outer(pos, freqs)  # type: ignore  # [S, D/2]

    if use_real:
        freqs_cos = freqs.cos().repeat_interleave(2, dim=1).float()  # [S, D]
        freqs_sin = freqs.sin().repeat_interleave(2, dim=1).float()  # [S, D]
        return freqs_cos, freqs_sin

    # lumina
    # Unit-magnitude complex numbers whose angles are the position-scaled frequencies.
    freqs_cis = torch.polar(torch.ones_like(freqs), freqs)  # [S, D/2]
    return freqs_cis
def identify_k( b: float, d: int, N: int):
    """
    Locate the intrinsic frequency component of a RoPE-based pre-trained diffusion transformer.

    The period of each of the d // 2 RoPE frequencies is computed (rounded to the
    nearest integer), and the component whose period is closest to N is selected.
    On ties, the component with the smallest index wins.

    Args:
        b (`float`): The base frequency for RoPE.
        d (`int`): Dimension of the frequency tensor
        N (`int`): the first observed repetition frame in latent space
    Returns:
        k (`int`): the 1-based index of the intrinsic frequency component
        N_k (`int`): the period of the intrinsic frequency component in latent space
    Example:
        In HunyuanVideo, b=256 and d=16, the repetition occurs approximately 8s (N=48 in latent space).
        k, N_k = identify_k(b=256, d=16, N=48)
        In this case, the intrinsic frequency index k is 4, and the period N_k is 50.
    """
    # Eq. (4): the j-th frequency is b ** (-2 (j - 1) / d); its period is 2 * pi / frequency.
    component_periods = [
        round(2 * torch.pi / (1.0 / (b ** (2 * (j - 1) / d))))
        for j in range(1, d // 2 + 1)
    ]

    # Eq. (7): choose the component whose period lies closest to N.
    # min() returns the first minimal element, so ties go to the lowest index.
    best = min(
        range(len(component_periods)),
        key=lambda idx: abs(component_periods[idx] - N),
    )
    return best + 1, component_periods[best]