FFT与游戏开发(五)
先放结果展示:
海浪的模拟,可以理解为一堆任意方向的正弦波的叠加,这些正弦波的频谱(相位和振幅)会随着时间而变化。
\[h(\overrightarrow x, t) = \sum_{\overrightarrow k} \tilde h (\overrightarrow k, t) e^{j \overrightarrow k \cdot \overrightarrow x} \]
\[\begin{aligned} 其中: &L_m, L_n为整个M*N个网格的边长 \\ \overrightarrow k &= (k_x, k_z) \\ k_x &= \frac{2\pi m}{L_m}, -\frac{M}{2} \leq m < \frac{M}{2} \\ k_z &= \frac{2\pi n}{L_n}, -\frac{N}{2} \leq n < \frac{N}{2} \\ \overrightarrow x &= \left( \frac{xL_m}{M}, \frac{yL_n}{N} \right) \\ & 0 \leq x < M \\ & 0 \leq y < N \\ \end{aligned} \]
这里需要说明下,这个式子和标准2D iDFT长得不太一样:
\[x(x, y) = \frac{1}{MN} \sum_{m=0}^{M-1}\sum_{n=0}^{N-1} X(m, n) e^{j2\pi (\frac{mx}{M} + \frac{ny}{N})} \]
这是因为:
- \(\overrightarrow k \cdot \overrightarrow x\)的结果就是\(2\pi \left(\frac{mx}{M} + \frac{ny}{N}\right)\),这一点和标准公式是一样的;
- 求和的范围不一样:标准公式是\(0 \leq m < M\)、\(0 \leq n < N\),而这里是\(-\frac{M}{2} \leq m < \frac{M}{2}\)和\(-\frac{N}{2} \leq n < \frac{N}{2}\)。下面推导一下:
\[设\overrightarrow k = \left( \frac{2\pi m - \pi M}{L_m}, \frac{2\pi n - \pi N}{L_n} \right); 0\leq m < M, 0\leq n < N \]
\[\begin{aligned} h(x, y, t) &= \frac{1}{MN} \sum_{m=0}^{M-1} \sum_{n=0}^{N-1} \tilde h(m,n,t) \exp\left( j \overrightarrow k \cdot \overrightarrow x \right) \\ &= \frac{1}{MN} \sum_{m=0}^{M-1} \sum_{n=0}^{N-1} \tilde h(m,n,t) \exp\left( j \frac{2\pi mx - \pi Mx}{M} + j \frac{2\pi ny - \pi Ny}{N} \right) \\ &= \frac{1}{MN} \sum_{m=0}^{M-1} \sum_{n=0}^{N-1} \tilde h(m,n,t) \exp\left( j2\pi \left( \frac{mx}{M} + \frac{ny}{N} \right) \right) \exp{(-j\pi(x+y))} \\ &= (-1)^{x+y} \frac{1}{MN} \sum_{m=0}^{M-1} \sum_{n=0}^{N-1} \tilde h(m,n,t) \exp\left( j2\pi \left( \frac{mx}{M} + \frac{ny}{N} \right) \right) \\ \end{aligned} \]
最后相当于是在标准的iFFT上面乘了一个\((-1)^{x+y}\),之前程序没有这么写,直觉上L的参数总是不太对,L比较大的时候,海浪应该比较密集,L比较小的时候,比较平缓,改成这样就符合直觉了。
飞利浦频谱(Phillips Spectrum)
我们有了从海浪频域转到海浪时域的工具,那么海浪的频域是怎么来的呢?通过海洋统计学,可以得到海浪的频谱随着时间变化的函数:
\[\tilde h(\overrightarrow k, t) = \tilde{h_0}(\overrightarrow k) e^{it\sqrt{gk}} + \tilde{h_0^*}(-\overrightarrow k) e^{-it\sqrt{gk}} \]
\[\begin{aligned} 其中: &L为整个N*N个网格的边长 \\ &k_x = \frac{2\pi m}{L}, -\frac{N}{2} \leq m < \frac{N}{2}\\ &k_z = \frac{2\pi n}{L}, -\frac{N}{2} \leq n < \frac{N}{2}\\ &\overrightarrow k = (k_x, k_z)\\ &k = |\overrightarrow k| \\ \\ 另外: &\tilde{h_0}(\overrightarrow k) = \frac{1}{\sqrt 2} (\xi_r + i\xi_i) \sqrt{P_h(\overrightarrow k)} \\ &P_h(\overrightarrow k) = A\frac{e^{-\frac{g^2}{k^2V^4}}}{k^4} |\overrightarrow k \cdot \overrightarrow \omega| ^ 2 \\ \xi_r, \xi_i:&相互独立的标准正态分布随机数 \\ \overrightarrow \omega:&风向 \\ V:&风速 \\ g:&重力加速度 \\ \end{aligned} \]
可以看到,这个函数巨复杂无比,不过好在我们也是可以让它跑在GPU上的。
HLSL实现
得到了波浪的频谱,就可以动手开始实现了,HLSL实现如下:
// Compute kernels exposed by this file, one #pragma per entry point.
// iFFT2x / iFFT2y: horizontal and vertical passes of the 2D inverse FFT.
#pragma kernel iFFT2x
#pragma kernel iFFT2y
// GenerateSpectrum: builds the time-dependent spectrum in a single pass.
#pragma kernel GenerateSpectrum
// GenerateTwinRandomGaussian: writes a pair of Gaussian random numbers per texel.
#pragma kernel GenerateTwinRandomGaussian
// GenerateSpectrumStepOne / GenerateSpectrumStepTwo: the same spectrum split
// into a time-independent part and a part that depends on time.
#pragma kernel GenerateSpectrumStepOne
#pragma kernel GenerateSpectrumStepTwo
// Number of butterfly stages; the transform length is 2^FFT_STAGES.
static const uint FFT_STAGES = 8;
// Transform length along one axis (1 << 8 = 256).
static const uint FFT_DIMENSION = 1 << FFT_STAGES;
// Butterflies per stage (half the transform length); also the thread-group
// size of the iFFT kernels.
static const uint FFT_BUTTERFLYS = FFT_DIMENSION >> 1;
static const float PI = 3.14159265;
// Scratch storage for the iFFT: two halves of FFT_DIMENSION entries each,
// used alternately as source and destination of successive stages.
groupshared float2 pingPongArray[FFT_DIMENSION * 2];
// Reverses the full 32-bit word of `index`, then shifts the result down so
// that only the top `count` bits of the reversed word remain. For
// index < 2^count this is the bit reversal of index within `count` bits.
uint ReverseBits(uint index, uint count) {
    const uint reversedWord = reversebits(index);
    const uint unusedBits = 32 - count;
    return reversedWord >> unusedBits;
}
// Complex product a * b, with each float2 holding (real, imaginary).
float2 ComplexMultiply(float2 a, float2 b) {
    float2 product;
    product.x = a.x * b.x - a.y * b.y; // real part
    product.y = a.y * b.x + a.x * b.y; // imaginary part
    return product;
}
// One radix-2 butterfly:
//   output0 = input0 + twiddleFactor * input1
//   output1 = input0 - twiddleFactor * input1
// All values are complex numbers stored as (real, imaginary).
void ButterFlyOnce(float2 input0, float2 input1, float2 twiddleFactor, out float2 output0, out float2 output1) {
    const float2 rotated = ComplexMultiply(twiddleFactor, input1);
    output1 = input0 - rotated;
    output0 = input0 + rotated;
}
// Returns e^(i * theta) as the complex number (cos(theta), sin(theta)).
float2 Euler(float theta) {
    float sine;
    float cosine;
    sincos(theta, sine, cosine);
    return float2(cosine, sine);
}
// Input of the current iFFT pass; iFFT2 reads one complex value (re, im) per texel.
Texture2D<float2> SrcTex;
// Output of the current iFFT pass; iFFT2 writes one complex value per texel.
// The element type is spelled out explicitly because read/write textures are
// declared with a template argument.
RWTexture2D<float2> DstTex;
// One 1D pass of the 2D inverse FFT (radix-2 butterflies on bit-reversed input).
//   id         dispatch thread id; one component selects the butterfly handled
//              by this thread, the other selects the row/column being transformed
//   horizontal true: transform along x (rows); false: transform along y (columns)
// Each thread processes one butterfly (two elements) per stage. Intermediate
// results bounce between the two halves of pingPongArray; the last stage is
// scaled by 1 / FFT_DIMENSION and written to DstTex.
// NOTE(review): the butterfly index comes from the dispatch thread id, so this
// presumably expects exactly one thread group along the transform axis — confirm
// against the dispatch code.
void iFFT2(uint2 id, bool horizontal)
{
    const uint butterFlyID = horizontal ? id.x : id.y;
    uint index0 = butterFlyID * 2;
    uint index1 = index0 + 1;

    // Load this thread's two elements from their bit-reversed source positions.
    if (horizontal) {
        pingPongArray[index0] = SrcTex[uint2(ReverseBits(index0, FFT_STAGES), id.y)].xy;
        pingPongArray[index1] = SrcTex[uint2(ReverseBits(index1, FFT_STAGES), id.y)].xy;
    } else {
        pingPongArray[index0] = SrcTex[uint2(id.x, ReverseBits(index0, FFT_STAGES))].xy;
        pingPongArray[index1] = SrcTex[uint2(id.x, ReverseBits(index1, FFT_STAGES))].xy;
    }

    // offset.x: start of the half being read; offset.y: start of the half being written.
    uint2 offset = uint2(0, FFT_DIMENSION);
    [unroll]
    for (uint s = 1; s <= FFT_STAGES; s++) {
        // Everything written by the previous stage (or the initial load) must be
        // visible before any thread reads it.
        GroupMemoryBarrierWithGroupSync();

        // Width of each independent sub-transform in this stage.
        uint m = 1 << s;
        uint halfWidth = m >> 1;
        // Which sub-transform this butterfly belongs to.
        uint nFFT = butterFlyID / halfWidth;
        // Position of this butterfly inside its sub-transform.
        uint k = butterFlyID % halfWidth;
        index0 = k + nFFT * m;
        index1 = index0 + halfWidth;

        // The twiddle factor has a positive exponent, e^(+j 2 pi k / m).
        float2 output0;
        float2 output1;
        ButterFlyOnce(
            pingPongArray[offset.x + index0], pingPongArray[offset.x + index1],
            Euler(2 * PI * k / m),
            output0, output1);

        if (s != FFT_STAGES) {
            pingPongArray[offset.y + index0] = output0;
            pingPongArray[offset.y + index1] = output1;
            offset.xy = offset.yx;
        } else {
            // Each 1D pass contributes 1 / FFT_DIMENSION of the normalisation.
            output0 /= FFT_DIMENSION;
            output1 /= FFT_DIMENSION;
            if (horizontal) {
                DstTex[uint2(index0, id.y)] = output0;
                DstTex[uint2(index1, id.y)] = output1;
            } else {
                // The spectrum is generated for wave numbers centred on zero
                // (index - FFT_DIMENSION / 2), which multiplies the standard iFFT
                // result by (-1)^(x + y). The sign is derived from the texel
                // actually written, (id.x, index0) or (id.x, index1), rather than
                // from the thread id.
                if (((id.x + index0) & 1) != 0) {
                    output0 = -output0;
                }
                if (((id.x + index1) & 1) != 0) {
                    output1 = -output1;
                }
                DstTex[uint2(id.x, index0)] = output0;
                DstTex[uint2(id.x, index1)] = output1;
            }
        }
    }
}
// Horizontal iFFT pass: a thread group is FFT_BUTTERFLYS threads along x,
// one butterfly per thread; id.y selects the row being transformed.
// NOTE(review): presumably dispatched with a single group along x and
// FFT_DIMENSION groups along y — confirm on the host side.
[numthreads(FFT_BUTTERFLYS, 1, 1)]
void iFFT2x(uint3 id : SV_DispatchThreadID) {
iFFT2(id.xy, true);
}
// Vertical iFFT pass: a thread group is FFT_BUTTERFLYS threads along y,
// one butterfly per thread; id.x selects the column being transformed.
// NOTE(review): presumably dispatched with FFT_DIMENSION groups along x and
// a single group along y — confirm on the host side.
[numthreads(1, FFT_BUTTERFLYS, 1)]
void iFFT2y(uint3 id : SV_DispatchThreadID) {
iFFT2(id.xy, false);
}
// Gravitational acceleration g used in the spectrum and the wave phase.
const static float G = 9.8;
// x squared.
float Pow2(float x) {
    return x * x;
}
// x to the fourth power, multiplied out left to right.
float Pow4(float x) {
    const float squared = x * x;
    return squared * x * x;
}
// Time-independent spectrum amplitude h0(k) = xi * sqrt(P_h(k)) / sqrt(2).
//   k_v    wave vector
//   k      length of k_v
//   w      wind direction
//   V      wind speed
//   xi     complex random sample stored as (real, imaginary)
//   sqrtA  square root of the spectrum constant A
// with
//   P_h(k) = A * exp(-1 / (k L)^2) / k^4 * |k_v . w|^2,   L = V^2 / g
// Returns (0, 0) for k == 0, where the expression is undefined.
// NOTE(review): the dot product uses k_v and w as passed in, without
// normalising either vector — confirm this matches the intended spectrum.
float2 H0(float2 k_v, float k, float2 w, float V, float2 xi, float sqrtA) {
    // At k == 0 the formula below evaluates 0 / 0 and yields NaN. Return zero
    // here so the result is well defined without depending on a later isnan().
    if (k <= 0) {
        return float2(0, 0);
    }
    float sqrtPh = sqrtA * exp(-Pow2(G / (k * V * V)) / 2) * abs(dot(k_v, w)) / Pow2(k);
    return xi * sqrtPh / sqrt(2);
}
// Spectrum value at wave vector k_v and time t:
//   h(k, t) = h0(k) e^{+i t sqrt(g k)} + conj(h0(-k)) e^{-i t sqrt(g k)}
// Returns a complex number as (real, imaginary); NaN components are replaced by 0.
float2 PhillipsSpectrum(
    float2 k_v,
    float sqrtA, // spectrum constant (square root of A)
    float t,     // time
    float2 w,    // wind direction
    float V,     // wind speed
    float2 xi)   // random sample
{
    // This part does not depend on t and could be precomputed.
    float k = length(k_v);
    float2 h0 = H0(k_v, k, w, V, xi, sqrtA);
    float2 h0_adj = H0(-k_v, k, w, V, xi, sqrtA) * float2(1, -1);

    // The phase grows without bound as t increases, so keep it in full float
    // precision: a 16-bit half cannot resolve fractions of a radian once the
    // value is large and overflows beyond 65504.
    float tsqrtGk = t * sqrt(G * k);
    float2 h = ComplexMultiply(h0, Euler(tsqrtGk)) + ComplexMultiply(h0_adj, Euler(-tsqrtGk));
    h.x = isnan(h.x) ? 0 : h.x;
    h.y = isnan(h.y) ? 0 : h.y;
    return h;
}
// Radical inverse in base 2: mirrors the 32 bits of `bits` by swapping
// progressively smaller groups, then scales the mirrored word by 2^-32.
float RadicalInverse_VdC(uint bits) {
    uint v = bits;
    v = (v >> 16u) | (v << 16u);                               // 16-bit halves
    v = ((v & 0xAAAAAAAAu) >> 1u) | ((v & 0x55555555u) << 1u); // adjacent bits
    v = ((v & 0xCCCCCCCCu) >> 2u) | ((v & 0x33333333u) << 2u); // bit pairs
    v = ((v & 0xF0F0F0F0u) >> 4u) | ((v & 0x0F0F0F0Fu) << 4u); // nibbles
    v = ((v & 0xFF00FF00u) >> 8u) | ((v & 0x00FF00FFu) << 8u); // bytes
    const float scale = 2.3283064365386963e-10f;               // 2^-32
    return v * scale;
}
// Output of GenerateSpectrum: one complex spectrum value (re, im) per texel.
RWTexture2D<float2> SpectrumTex;
// Time t used for the wave phase t * sqrt(g * k).
float _Time;
// Square root of the spectrum constant A.
float _SqrtAmplitude;
// Wind direction.
float2 _WindDirection;
// Wind speed V.
float _WindSpeed;
// Patch edge length L in k = 2 * pi * (index - FFT_DIMENSION / 2) / L.
float _PatchLength;
// Single-pass spectrum generation: derives a Gaussian pair from the texel
// coordinates, maps the texel to a wave vector and stores the spectrum value
// for the current _Time in SpectrumTex.
[numthreads(4, 4, 1)]
void GenerateSpectrum(uint3 id : SV_DispatchThreadID) {
    // Two numbers from the radical inverse of the texel coordinates. The index
    // is offset by one; an index of 0 maps to 0, for which log() below diverges.
    const float u = RadicalInverse_VdC(id.x + 1);
    const float v = RadicalInverse_VdC(id.y + 1);

    // Box-Muller transform: (sin, cos) of a random angle times a random radius.
    float2 gaussianPair;
    sincos(2 * PI * v, gaussianPair.x, gaussianPair.y);
    gaussianPair *= sqrt(-2 * log(u));

    // Texel index shifted so that wave numbers are centred on zero.
    const float2 gridIndex = (float2)id.xy - FFT_DIMENSION / 2;
    const float patchLength = _PatchLength;
    const float2 waveVector = 2 * PI * gridIndex / patchLength;

    const float2 spectrum =
        PhillipsSpectrum(waveVector, _SqrtAmplitude, _Time, _WindDirection, _WindSpeed, gaussianPair);
    SpectrumTex[id.xy] = spectrum;
}
// Output of GenerateTwinRandomGaussian: one pair of Gaussian numbers per texel.
RWTexture2D<float2> TwinRandomGaussianTexOutput;

// Writes a Box-Muller pair per texel, driven by the radical inverse of the
// texel coordinates.
[numthreads(4,4,1)]
void GenerateTwinRandomGaussian(uint3 id : SV_DispatchThreadID) {
    // The index is offset by one; an index of 0 maps to 0, for which log() diverges.
    float2 rand = float2(RadicalInverse_VdC(id.x + 1), RadicalInverse_VdC(id.y + 1));
    float2 xi;
    // xi.x receives the sine and xi.y the cosine of the random angle.
    sincos(2 * PI * rand.y, xi.x, xi.y);
    xi *= sqrt(-2 * log(rand.x));
    TwinRandomGaussianTexOutput[id.xy] = xi;
}
// Input: one pair of Gaussian random numbers per texel.
Texture2D<float2> TwinRandomGaussianTexInput;
// Output: xy = h0(k), zw = conj(h0(-k)). Four channels are written, so the
// element type must be float4.
RWTexture2D<float4> SpectrumStepOneOutput;

// Time-independent half of the spectrum: evaluates h0(k) and conj(h0(-k)) for
// the wave vector of this texel and stores both for GenerateSpectrumStepTwo.
[numthreads(4,4,1)]
void GenerateSpectrumStepOne(uint3 id : SV_DispatchThreadID) {
    float sqrtA = _SqrtAmplitude;
    float2 w = _WindDirection;
    float V = _WindSpeed;
    float2 xi = TwinRandomGaussianTexInput[id.xy];

    // Texel index shifted so that wave numbers are centred on zero.
    float2 mn = (float2)id.xy - FFT_DIMENSION / 2;
    float L = _PatchLength;
    float2 k_v = 2 * PI * mn / L;
    float k = length(k_v);

    float2 h0 = H0(k_v, k, w, V, xi, sqrtA);
    // NOTE(review): the conjugate term reuses this texel's sample xi rather than
    // the sample stored at the texel of -k — confirm this is intentional.
    float2 h0_adj = H0(-k_v, k, w, V, xi, sqrtA) * float2(1, -1);
    SpectrumStepOneOutput[id.xy] = float4(h0, h0_adj);
}
// Input: xy = h0(k), zw = conj(h0(-k)), as written by GenerateSpectrumStepOne.
Texture2D<float4> SpectrumStepOneInput;
// Output: spectrum value h(k, t) as (re, im).
RWTexture2D<float2> SpectrumStepTwoOutput;

// Time-dependent half of the spectrum:
//   h(k, t) = h0(k) e^{+i t sqrt(g k)} + conj(h0(-k)) e^{-i t sqrt(g k)}
// NaN components are replaced by 0 before the result is stored.
[numthreads(4, 4, 1)]
void GenerateSpectrumStepTwo(uint3 id : SV_DispatchThreadID) {
    float4 input = SpectrumStepOneInput[id.xy];
    float2 h0 = input.xy;
    float2 h0_adj = input.zw;
    float t = _Time;

    // Texel index shifted so that wave numbers are centred on zero.
    float2 mn = (float2)id.xy - FFT_DIMENSION / 2;
    float L = _PatchLength;
    float2 k_v = 2 * PI * mn / L;
    float k = length(k_v);

    // The phase grows without bound as t increases, so keep it in full float
    // precision: a 16-bit half cannot resolve fractions of a radian once the
    // value is large and overflows beyond 65504.
    float tsqrtGk = t * sqrt(G * k);
    float2 h = ComplexMultiply(h0, Euler(tsqrtGk)) + ComplexMultiply(h0_adj, Euler(-tsqrtGk));
    h.x = isnan(h.x) ? 0 : h.x;
    h.y = isnan(h.y) ? 0 : h.y;
    SpectrumStepTwoOutput[id.xy] = h;
}
有几点需要注意下:
- 原本是想利用Hammersley Sequence做GPU上的随机数生成,但是这样发现还是出现了一些重复的pattern,于是还是放在CPU端去做。
- 生成Phillips Spectrum可以抽出时间参数t,预计算一部分(GenerateSpectrumStepOne),减少每帧需要做的计算量。
参考资料
- TESSENDORF, J., 2001. Simulating ocean waters. In SIGGRAPH course notes (course 47), ACM SIGGRAPH
- https://zhuanlan.zhihu.com/p/64414956
- Fynn-Jorin Flügge, Realtime GPGPU FFT Ocean Water Simulation