http://www.cs.cornell.edu/~kb/projects/epigpu/
Abstract: The render cache and the edge-and-point image (EPI) are alternative point-based rendering techniques that combine interactive performance with expensive, high quality shading for complex scenes. They use sparse sampling and intelligent reconstruction to enable fast framerates and to decouple shading from the display update.
We present a hybrid CPU/GPU multi-pass system that accelerates these techniques by utilizing programmable graphics processing units (GPUs) to achieve better framerates while freeing the CPU for other uses such as high-quality shading (including global illumination). Because the render cache and EPI differ from the traditional graphics pipeline in interesting ways, we encountered several challenges in using the GPU effectively. We discuss our optimizations to achieve good performance, limitations with the current generation hardware, as well as possibilities for future improvements.
The following diagram shows the data flow on the GPU. Squares in the figure represent textures and rectangles represent VBOs. The colored boxes are the different shaders used, grouped by the stage in which they are used: Point processing (green), Edge finding and rasterization (blue), and Image filters (red). Click on each one to view the corresponding Cg shader source code.
Age & penalize
/* ==========================================================================================
   Cg Acceleration Research
   Edgar Velázquez Armendáriz - edgar [at] graphics [dot] cornell [dot] edu
   ------------------------------------------------------------------------------------------
   ageMain.cg
   Fragment program to age the points and just copy the current imageID.
   ========================================================================================== */

/**
 * Ages one texel of the point buffer and applies the age penalties.
 *
 * The texel read from `current` is copied to OUT; only its R channel (the age) is
 * modified, the image id stored in G, B and A is passed through unchanged.
 *
 * pos      window position of the fragment, used directly as the rectangle-texture coordinate
 * current  texture holding the age (R) and the image id (GBA) of every point
 * OUT      the same texel with the updated age
 *
 * fp40: # 17 instructions, 1 R-regs, 1 H-regs
 */
void ageMain(in half2 pos : WPOS,
             uniform samplerRECT current : TEXUNIT0,
             out half4 OUT : COLOR) {
    // STATIC DATA
    const static half INCREMENT = 1;    // [0,255]

    // Gets the image id and age
    OUT = texRECT(current, pos);

    // New points (age == 0) are not penalised
    if (OUT.r > 0) {
        // Penalization for points that were not projected to the image.
        // The comparison is done on the whole vector at once: when no component is below
        // its threshold, the image id is filled with ones, which marks an invalid imageID.
        // Using DeMorgan:
        //
        //   if (A) -> Penalize
        //     A  =  (x eq 0) && (y eq 0) && (z eq 0)
        //    !A  = !(x eq 0) || !(y eq 0) || !(z eq 0)
        //    !A  =  x neq 0  ||  y neq 0  ||  z neq 0    <-- This is the Cg function any()
        //     A  = !any()
        if ( !any(OUT.gba < half3(1, 1, 127/255.0)) ) {
            OUT.r += 8/255.0;
            //OUT.r = 0;
        }

        // Color change penalty, using the flag stored in the top bit of alpha
        if (OUT.a >= 128/255.0) {
            //OUT.r += 28/255.0;
            OUT.r = 1;
        }
    }

    // Finally age the point. Because this is an RGBA8 buffer, values are saturated for free
    OUT.r += INCREMENT/255.0;
}
Point projection
/**
 * Fragment program that ages one texel of the point buffer.
 *
 * R holds the age and G, B, A hold the image id. The texel is copied through and only
 * the age is changed:
 *   - age == 0 (a new point): no penalty, just the regular increment;
 *   - alpha >= 128/255 (colour-change flag set): the age is forced to its maximum;
 *   - otherwise, when no image-id component is below (1, 1, 127/255), i.e. the id is
 *     all ones and therefore invalid (the point was not projected): age += 8/255;
 *   - in every case the age is finally advanced by one 8-bit step.
 *
 * pos      window position, used as the rectangle-texture coordinate
 * current  texture with the age / image id of every point
 * OUT      the updated texel
 */
void ageMain(in half2 pos : WPOS,
             uniform samplerRECT current : TEXUNIT0,
             out half4 OUT : COLOR) {
    // Regular per-pass increment, in 8-bit units [0,255]
    const static half AGE_STEP = 1;

    // Start from the stored age + image id
    OUT = texRECT(current, pos);

    // Only points that already have an age can be penalised
    if (OUT.r > 0) {
        if (OUT.a >= 128/255.0) {
            // Colour-change flag: this overrides any other penalty
            OUT.r = 1;
        }
        else if ( !any(OUT.gba < half3(1, 1, 127/255.0)) ) {
            // No component below its threshold -> invalid image id -> not projected
            OUT.r += 8/255.0;
        }
    }

    // The target is an RGBA8 buffer, so the result saturates on write
    OUT.r += AGE_STEP/255.0;
}
/* Vertex information to be transferred: used as both the input and the output of vertMain, */
/* and as the interpolated input of fragMain. */
struct vertexInfo {
// Vertex position (clip-space position on output of vertMain)
float4 pos : POSITION;
// Rectangle-texture coordinate used by fragMain to fetch the point's packed data
half2 uv : TEXCOORD0;
// Point colour; fragMain also forwards its alpha into the third render target
half4 color : COLOR0;
// Secondary colour carrying the encoded pointID (see vertMain / fragMain)
half4 colorSec : COLOR1;
// Fractional (sub-pixel) position of the projected point, written by vertMain.
// No semantic is bound explicitly here.
half2 subPixel;
};
/**
 * Point-projection vertex program.
 *
 * Projects the point, keeps it inside the far clipping plane, forwards its attributes and
 * computes the sub-pixel position of the projection.
 *
 * ModelViewProj  model-view-projection matrix (bound to state.matrix.mvp)
 * c              vector <width/2, height/2> used for the sub-pixel transformation
 * IN             incoming vertex
 * OUT            transformed vertex; OUT.subPixel holds the fractional pixel position
 *
 * The original author reported vp40: # 18 instructions, 2 R-regs.
 */
void vertMain( uniform float4x4 ModelViewProj : state.matrix.mvp,
               uniform half2 c,
               in vertexInfo IN,
               out vertexInfo OUT ) {
    // Clip-space position of the vertex
    float4 clipPos = mul(ModelViewProj, IN.pos);

    // Keep z just below w so that background points are not removed by the depth cull
    clipPos.z = clamp(clipPos.z, -1e38, clipPos.w * (0.999999523162841796875));
    OUT.pos = clipPos;

    // Pass-through attributes: the texture coordinate indexes the packed per-point data,
    // the secondary colour carries the encoded pointID
    OUT.uv       = IN.uv;
    OUT.color    = IN.color;
    OUT.colorSec = IN.colorSec;

    // Pixel position of the projection; its fractional part is the sub-pixel location
    const half2 pixel    = c * clipPos.xy / clipPos.w + c;
    const half2 fraction = frac(pixel);

    // The fraction is measured from the bottom-left corner, but it is needed from the
    // upper-left one, so y is mirrored
    OUT.subPixel = half2(fraction.x, 1 - fraction.y);
}
/**
 * Point-projection fragment program: fills the three render targets for a projected point.
 *
 *   outputs[0] = (point colour rgb, sub-pixel index / 255)
 *   outputs[1] = (flags, pointID)          - flags in R, pointID split over G, B, A
 *   outputs[2] = (priority, seqnum, flags + seqnum, 0)
 *                alpha is 0 for valid pixels; invalid pixels have a = 1
 *
 * packData  texture with the packed id/age of every point (age in R)
 * IN        interpolated vertex data produced by vertMain
 * pos       window position (not read by this program)
 * depth     depth output (declared, but never written by this program)
 * outputs   the three colour render targets described above
 *
 * Age-based rejection is disabled. The original code tested age >= 255/255 and wrote a
 * debug colour to outputs[0] that was overwritten unconditionally right afterwards, with
 * the intended `discard` commented out. That dead branch (and its MAX_AGE constant) has
 * been removed; the values written are unchanged.
 * NOTE(review): confirm whether expired points should really be discarded here.
 *
 * The original author reported fp40: # 15 instructions, 1 R-regs, 1 H-regs.
 */
void fragMain( uniform samplerRECT packData : TEXUNIT0,
               in vertexInfo IN,
               in float3 pos : WPOS,
               out float depth : DEPTH,
               out half4 outputs[3] : COLOR0 ) {
    // UPDATE TO REFLECT CHANGES IN ENVIRONMENT
    const half FLAGS = 64;      // [0,255]   FLAGS == 0x40

    // After the texture operation, do some math to cover the latency
    half4 idAgePacked = texRECT(packData, IN.uv);

    // Sub-pixel cell inside a 4x4 grid; subPix (not scaled) is in the range [0, 15]
    half2 subPixV = floor(half2(4,4) * IN.subPixel);
    half  subPix  = 1/255.0h * (4*subPixV.y + subPixV.x);

    // Normal colour, with the sub-pixel info in alpha, ready to be attached to the EPI-GPU
    outputs[0] = half4(IN.color.rgb, subPix);

    // The secondary colour contains the pointID, so it is copied behind the flags
    outputs[1] = half4(FLAGS/255.0, IN.colorSec.rgb);

    // Third render buffer:
    //   r - priority
    //   g - seqnum
    //   b - flags/seqnum info
    //   a - 0 as flag
    half priority = saturate(idAgePacked.r/2.0 - 8/255.0);    // priority = max(0,age-16) / 2
    outputs[2] = half4(priority, IN.color.a, 1/255.0h * FLAGS + IN.color.a, 0);
}
Predicted projection
/**
 * Minimal vertex program: transforms a vertex to clip space, mirrors it vertically and
 * flattens its depth. It is meant for passes where the depth test is disabled.
 *
 * ModelViewProj  model-view-projection matrix (bound to state.matrix.mvp)
 * IN             vertex position
 * OUT            clip-space position with y negated and z forced to 0
 *
 * The original author reported vp40: # 5 instructions, 1 R-regs.
 */
void vertSimple( uniform float4x4 ModelViewProj : state.matrix.mvp,
                 in float4 IN : POSITION,
                 out float4 OUT : POSITION )
{
    const float4 clip = mul(ModelViewProj, IN);

    // y is negated to flip the image; z is irrelevant for this pass, and fixing it to 0
    // lets the compiler skip the dot product that would compute it
    OUT = float4(clip.x, -clip.y, 0, clip.w);
}
/**
 * Constant fragment program: every fragment is written as white.
 *
 * The original author reported fp40: # 1 instructions, 0 R-regs, 0 H-regs.
 */
half3 fragSimple() : COLOR {
    return 1.0h.xxx;
}
Set Image ID
/**
 * Simple vertex input/output structure: used as both the input and the output
 * of colorCopyVert.
 */
struct vertSimpleData {
// Vertex position; on input colorCopyVert reads the encoded PointID from its x,y
float4 pos : POSITION;
// Vertex colour, copied through unchanged by colorCopyVert
half4 col : COLOR;
};
/**
 * Fragment program that stores the image id carried by the incoming colour.
 *
 * IMPORTANT: the 24-bit image id arrives in the first three colour components, but it is
 * written into the last three components (G, B, A) of the output, because the R channel
 * of the destination contains the age. The first output component is not written here.
 *
 * IN   incoming colour with the image id in its first three components
 * OUT  output colour; only its last three components are assigned
 *
 * The original author reported fp40: # 1 instructions, 0 R-regs, 0 H-regs.
 */
void copyColorFrag( in half4 IN : COLOR,
                    out half4 OUT : COLOR )
{
    OUT.yzw = IN.xyz;
}
/**
 * Vertex program that scatters a colour to the texel belonging to a point.
 *
 * The PointID arrives encoded in the x,y position of the vertex. It is decoded into a
 * linear index and converted to the homogeneous clip-space position of that texel.
 *
 * Layout of the pointid_flags data that is read back as vertices:
 *     R - flags
 *     G - LSB of pointid
 *     B
 *     A - MSB
 * A pixel with no point ID and flags = 0x10 means that no point was mapped there, which
 * shows up as the incoming vertex (16,0). Those vertices are moved outside the clip volume.
 *
 * LEN  the length of the point cloud texture
 * IN   vertex with the encoded PointID in pos.xy and the colour to copy
 * OUT  clip-space position of the point's texel plus the copied colour. For the "no point"
 *      vertex only the position is written.
 *
 * The original author reported vp40: # 29 instructions, 2 R-regs (MIMD branching)
 *                                    # 23 instructions, 2 R-regs (regular code)
 */
void colorCopyVert( uniform float LEN,
                    in vertSimpleData IN,
                    out vertSimpleData OUT )
{
    if ( all(IN.pos.xy == float2(16,0)) ) {
        // Nothing was mapped to this pixel: place the vertex where it gets clipped
        OUT.pos = float4(-2,-2,-2,1);
    }
    else {
        // x carries the flags byte and the low id byte, y carries the upper id bytes
        const float2 scaled = IN.pos.xy * float2(1/256.0, 256.0);

        // The data was written as unsigned bytes but the vertices are interpreted as SIGNED
        // shorts, so any x above 0x7FFF comes in negative. The y part does not reach that
        // range until there are around 8 million points. For negative x the low byte is
        // recovered by adding 256 after the floor. floor() is enough because every value
        // has a flags field, so x / 256 always has a fractional part.
        float wrapFix = 0;
        if (scaled.x < 0) {
            wrapFix = 256;
        }
        const float index = floor(scaled.x) + scaled.y + wrapFix;

        // Split index / LEN into its fractional part (column) and integral part (row).
        // Regions without points produce index 0; the original notes that real points
        // start at 32, so writing garbage to point 0 is harmless and saves one test.
        float row;
        const float column = modf(index/LEN, row);

        // Homogeneous xy coordinates, in [-1, 1] for valid values
        const float2 ndc = (1/LEN - 1.0).xx + float2(2.0, 2/LEN) * float2(column, row);

        // z is always 0 and w is always 1
        OUT.pos = float4(ndc, 0, 1);
        OUT.col = IN.col;
    }
}
Depth Cull
// The output of the depth vertex shader (depthVertMain) and the input of depthMain
struct depthVertexInfo {
// Clip-space position of the vertex
float4 pos : POSITION;
// To hold all the interpolated texture coordinates: each element packs two 2D
// coordinates (.xy and .zw); depthVertMain leaves texCoords[4].zw unassigned
half4 texCoords[5]; // To hold all the interpolated texture coordinates
};
/**
 * Vertex shader for the depth filter. It precomputes the texture coordinates of the 3x3
 * neighbourhood so the fragment program receives them already interpolated.
 *
 * Two coordinates are packed per element of OUT.texCoords (.xy and .zw):
 *   [0] (-s, s) ( 0, s)    [1] ( s, s) (-s, 0)    [2] ( 0, 0) ( s, 0)
 *   [3] (-s,-s) ( 0,-s)    [4] ( s,-s)  unused
 *
 * ModelViewProj  model-view-projection matrix (bound to state.matrix.mvp)
 * uv             texture coordinate of the vertex (rectangle texture, not normalized)
 * pos            vertex position
 * OUT            clip-space position plus the nine neighbourhood coordinates
 *
 * The original author reported vp40: # 15 instructions, 2 R-regs.
 */
void depthVertMain( uniform float4x4 ModelViewProj : state.matrix.mvp,
                    in half2 uv : TEXCOORD0,
                    in float4 pos : POSITION,
                    out depthVertexInfo OUT) {
    // One texel: TEXTURE_RECTANGLE coordinates are not normalized
    static const half s = 1.0h;

    OUT.pos = mul(ModelViewProj, pos);

    // Upper row
    OUT.texCoords[0].xy = uv + half2(-s,  s);
    OUT.texCoords[0].zw = uv + half2( 0,  s);
    OUT.texCoords[1].xy = uv + half2( s,  s);

    // Middle row
    OUT.texCoords[1].zw = uv + half2(-s,  0);
    OUT.texCoords[2].xy = uv;
    OUT.texCoords[2].zw = uv + half2( s,  0);

    // Lower row
    OUT.texCoords[3].xy = uv + half2(-s, -s);
    OUT.texCoords[3].zw = uv + half2( 0, -s);
    OUT.texCoords[4].xy = uv + half2( s, -s);
}
/**
 * Fragment shader to perform the depth culling. It uses a custom vertex shader to precalculate
 * all the texture coordinates, instead of making uv + half2(offset, -offset), just fetch them.
 *
 * The depths of the 3x3 neighbourhood are averaged, ignoring neighbours whose depth is not
 * below 1 (invalid). The average is turned into a boundary with zTransform, and a pixel
 * whose own depth lies beyond that boundary is cleared.
 *
 * depthTex    depth texture to filter
 * zTransform  the four factors for scaling the projected z-values:
 *             boundary = (avg * x + z) / (avg * y + w)
 * IN          interpolated neighbourhood coordinates from depthVertMain
 * depth       output depth; always set to the computed boundary
 * outputs     three colour targets: clear colours when the pixel is culled, all ones otherwise
 *
 * fp40: # 38 instructions, 2 R-regs, 2 H-regs
 */
void depthMain( uniform samplerRECT depthTex,
uniform float4 zTransform, // The four factors for scaling the projected z-values
in depthVertexInfo IN,
out float depth : DEPTH,
out half4 outputs[3] : COLOR0 ) {
// Now compute the 3x3 depth filter, first getting the average;
float z;
float4 alfa;
float4 beta;
// The early-out below is disabled. As a consequence the centre sample always takes part
// in the average (see sumCount), even when it is invalid.
//
// --> WITHOUT THIS, the depth is also blurred on the very same pass!
//if (z == 1) { discard; }
// Lookup the values: alfa and beta hold the eight neighbours, z the centre
alfa.x = texRECT( depthTex, IN.texCoords[0].xy ).r; // uv + (-s, s)
alfa.y = texRECT( depthTex, IN.texCoords[0].zw ).r; // uv + ( 0, s)
alfa.z = texRECT( depthTex, IN.texCoords[1].xy ).r; // uv + ( s, s)
alfa.w = texRECT( depthTex, IN.texCoords[1].zw ).r; // uv + (-s, 0)
z = texRECT( depthTex, IN.texCoords[2].xy ).r; // uv
beta.x = texRECT( depthTex, IN.texCoords[2].zw ).r; // uv + ( s, 0)
beta.y = texRECT( depthTex, IN.texCoords[3].xy ).r; // uv + (-s, -s)
beta.z = texRECT( depthTex, IN.texCoords[3].zw ).r; // uv + ( 0, -s)
beta.w = texRECT( depthTex, IN.texCoords[4].xy ).r; // uv + ( s, -s)
// Invalid values have a depth of 1, so build a 0/1 mask per neighbour to get rid of them
half4 alfaF = alfa < 1.0h.xxxx ? 1.0h.xxxx : 0.0h.xxxx;
half4 betaF = beta < 1.0h.xxxx ? 1.0h.xxxx : 0.0h.xxxx;
//half4 alfaF = !step(1.0h.xxxx, alfa);
//half4 betaF = !step(1.0h.xxxx, beta);
// Use those factors to get the proper values: invalid neighbours contribute 0 to the sum
//alfa = (alfaF != 0.0h.xxxx) ? alfa : 0.0f.xxxx;
//beta = (betaF != 0.0h.xxxx) ? beta : 0.0f.xxxx;
alfa *= alfaF;
beta *= betaF;
// To make a fast add of all values, construct a matrix with 4 rows, 4 columns
// The first two rows will have the 8 surrounding depth values, and the other
// two have the element count. Multiplying by a vector of ones yields the four row sums
// (sum alfa, sum beta, count alfa, count beta) in a single operation.
float4x4 values = float4x4(alfa, beta, alfaF, betaF);
float4 sumPart = mul(values, 1.0f.xxxx);
// The sum of values will be in x, the number of elements in y.
// The centre sample is added unconditionally, so the count is at least 1.
float2 sumCount = float2(z + sumPart.x, 1 + sumPart.z) + sumPart.yw;
// To perform the depth cull, the boundary of the test must be
// calculated, because the z values read from the depth buffer
// do not map linearly with the model's depth.
float average = sumCount.x / sumCount.y;
// Offset: rational transform of the average
float2 vecTmp = (average.xx * zTransform.xy) + zTransform.zw;
float boundary = vecTmp.x / vecTmp.y;
// boundary is a single limit: pixels whose depth is greater than it are culled
if (z > boundary) {
// Clear each buffer to its corresponding clear color. It is
// faster to clear all them to the same color, with a single
// instruction, but this is the logic of the application
outputs[0] = half4(0,0,0,0);
outputs[1] = half4(16/255.0,0,0,0); // Clear with invalid point flag
outputs[2] = half4(0,0,0,0);
//z = 1;
}
else {
// The pixel survives: all three targets receive ones.
// NOTE(review): presumably combined with the existing contents through the blend
// state set by the application - confirm.
//z = boundary;
outputs[0] = 1.0h.xxxx;
outputs[1] = 1.0h.xxxx;
outputs[2] = 1.0h.xxxx;
}
// In both branches the depth written is the boundary, not the original depth
z = boundary;
// Depth test must be enabled for the depth texture to be written.
// In order to erase the previous pixels, and at the same time allow to write
// in the color buffer, the DepthTest function must be GL_ALWAYS.
// This way, I will have written both the color and the depth
// If the depth is not the last thing written, everything gets messed up
depth = z;
}
Silhouette edge finder
/**
 * Silhouette edge finder. Every fragment tests one edge, identified by its position in
 * the edge textures, and outputs white when the edge passes its test, black otherwise.
 *
 * Edges are sorted by kind: [0, crease) regular, [crease, border) crease and
 * [border, totalCount) border edges. Fragments at or beyond totalCount output black.
 *
 * uv          window position, also used as the rectangle-texture coordinate
 * eye         eye position
 * crease      index of the first crease edge
 * border      index of the first border edge
 * totalCount  total number of edges to be tested
 * texLen      length of the texture (texels per row)
 * sV0         one vertex of every edge
 * sN0, sN1    the two normals associated with every edge
 *
 * The original author reported # 25 instructions, 3 R-regs, 2 H-regs.
 */
half3 main(half2 uv : WPOS,
           uniform float3 eye,
           uniform float crease,
           uniform float border,
           uniform float totalCount,
           uniform float texLen,
           uniform samplerRECT sV0,
           uniform samplerRECT sN0,
           uniform samplerRECT sN1) : COLOR
{
    // Edge data comes from floating point textures
    float3 v0 = texRECT(sV0, uv).rgb;
    float3 n0 = texRECT(sN0, uv).rgb;
    float3 n1 = texRECT(sN1, uv).rgb;

    // WPOS takes the values {0.5, 1.5, 2.5, ...}; removing the half texel and
    // linearizing gives the index of the edge handled by this fragment
    half2 texel = uv - half2(0.5, 0.5);
    float index = (texel.x + texel.y*texLen);

    // Vector from the edge vertex towards the eye
    float3 p = eye - v0;

    // Black unless the edge passes its test
    half3 res = half3(0, 0, 0);

    if (index < totalCount) {
        // Side of each adjacent face on which the eye lies
        const float2 facing = sign(float2(dot(p, n0), dot(p, n1)));

        if (index < border) {
            if (index < crease) {
                // Regular edge: the two faces look to opposite sides
                res = (facing.x != facing.y).rrr;
            }
            else {
                // Crease edge: at least one face looks towards the eye
                res = (facing.x > 0 || facing.y > 0).rrr;
            }
        }
        else {
            // Border edge: only the first normal is meaningful
            res = (facing.x > 0).rrr;
        }
    }

    return res;
}
Raster Edges
// Instruction counts reported for the EdgeRastersVP vertex program that follows:
// vp40: # 40 instructions, 3 R-regs
// orig: # 66 instructions, 4 R-regs
// Output of EdgeRastersVP
struct VertexOutput {
// Clip-space position of the (extended) edge vertex
float4 position : POSITION;
// Both end points of the edge in image space: first vertex in .xy, second in .zw;
// EdgeRastersFP reads them through TEXCOORD1
float4 edgeVertices : TEXCOORD1;
};
/**
 * Vertex program for edge rasterization.
 *
 * Both end points of the edge are projected to image space. The vertex being processed is
 * pushed two pixels away from the other end point, along the edge, and converted back to
 * clip space. Edges whose end points fall in the same pixel are culled. The image-space
 * end points are forwarded in a fixed order for the fragment program.
 *
 * position             the end point of the edge carried by this vertex
 * edgeVO               the other end point of the edge
 * width, height        size of the image in pixels
 * modelViewProjMatrix  model-view-projection matrix
 *
 * returns  the extended clip-space position and the ordered pair of image-space end points
 *
 * The original author reported 40 instructions for the vectorized formulation used here,
 * against 66 for the component-by-component one.
 */
VertexOutput EdgeRastersVP(float4 position : POSITION,
                           float3 edgeVO : TEXCOORD0,
                           const uniform float width,
                           const uniform float height,
                           const uniform float4x4 modelViewProjMatrix) {
    const float2 viewport = float2(width, height);

    // Both end points in homogeneous clip space
    float4 clip0 = mul(modelViewProjMatrix, position);
    const float4 clip1 = mul(modelViewProjMatrix, float4(edgeVO.xyz, 1.0f));

    // ... and in image space, both at once: perspective divide, [-1,1] -> [0,1], scale
    const float4 pixels = ((( float4(clip0.xy, clip1.xy) / float4(clip0.ww, clip1.ww) ) * 0.5)
                           + 0.5f.xxxx) * float4(viewport, viewport);
    const float2 pix0 = pixels.xy;
    const float2 pix1 = pixels.zw;

    // Two pixels along the edge direction
    const float2 extend = normalize(pix1 - pix0) * 2;

    // These are small edges: both end points inside the same pixel
    const bool samePixel = (floor(pix0.x) == floor(pix1.x) && floor(pix0.y) == floor(pix1.y));

    // Move this vertex backwards along the edge and return to homogeneous clip space
    clip0.xy = ((pix0 - extend) / viewport - 0.5f.xx) * clip0.ww / 0.5f.xx;

    VertexOutput result;
    result.position = clip0;

    // Cull out small edges by throwing the vertex far away
    if (samePixel)
        result.position.x = -1e38;

    // Order the end points (smaller x first, smaller y first on ties) so that both
    // vertices of an edge emit the same value and it survives interpolation unchanged
    const bool keepOrder = ((pix0.x == pix1.x) && (pix0.y < pix1.y)) || (pix0.x < pix1.x);
    result.edgeVertices = keepOrder ? float4(pix0.xy, pix1.xy)
                                    : float4(pix1.xy, pix0.xy);

    return result;
}
/**
 * Fragment program for edge rasterization. It records where the edge crosses two sides
 * of the pixel.
 *
 * The edge is parametrized as P0 + t * direction and intersected with the vertical line
 * x = position.x - 0.5 and the horizontal line y = position.y + 0.5. A crossing is kept
 * when none of the rejection tests fires: it lies within the pixel and its t is in [0, 1].
 * Its offset from the pixel's (x - 0.5, y - 0.5) corner is quantized with floor(offset * 7)
 * and stored as 2^code / 255.
 *
 *   R - crossing of the horizontal line (offset along x), 0 if there is none
 *   G - crossing of the vertical line   (offset along y), 0 if there is none
 *   B - always 0
 *
 * position      window position of the fragment (pixel centre)
 * edgeVertices  image-space end points of the edge (.xy and .zw)
 * depthImage    not used by this program
 * BIAS          not used by this program
 *
 * The original author reported fp40: # 23 instructions, 3 R-regs, 2 H-regs for the
 * vectorized formulation (45 instructions originally).
 */
half3 EdgeRastersFP( in float3 position : WPOS,
                     in float4 edgeVertices : TEXCOORD1,
                     uniform samplerRECT depthImage,
                     uniform float BIAS) : COLOR0
{
    // Origin and direction of the parametrized edge
    const float2 origin = edgeVertices.xy;
    const float2 span   = edgeVertices.zw - origin;

    // Pixel bounds: (x - 0.5, x + 0.5, y - 0.5, y + 0.5)
    const float4 bounds = position.xxyy + float4(-0.5, 0.5, -0.5, 0.5);

    // Parameters at which the edge reaches x = bounds.x (in .x) and y = bounds.w (in .y)
    const float2 t = (bounds.xw - origin) / span;

    // x of the horizontal-line crossing and y of the vertical-line crossing
    const float2 hit = span * t.yx + origin;

    // Rejected when outside the pixel or outside the segment
    const half2 valid = (!((hit > bounds.yw) || (hit < bounds.xz) || (t.yx < 0.0f.xx) || (t.yx > 1.0f.xx)));

    // One bit per quantized offset, zeroed for rejected crossings
    const float2 code = exp2(floor((hit - bounds.xz) * 7)) * valid;

    return float3(code, 0) / 255.0;
}
Image filters
Pixel Classify & Point Cull
// # 82 instructions, 3 R-regs, 2 H-regs, no if-code
// # 84 instructions, 2 R-regs, 3 H-regs, if code fp40
// This unified shader performs both the pixel classify and the point cull, using MRT!
// OUT[0] = PixelClass
// OUT[1] = PointCull
//
// Original:
// -PixelClass: # 94 instructions, 3 R-regs, 1 H-regs
// -PointCull: # 4 instructions, 2 R-regs, 0 H-regs
//
// Parameters:
// pixelPos           window position, used as the rectangle-texture coordinate
// edgeIntersections  per-pixel edge crossings: R is read as the top crossing, G as the left one
// subPixelLocations  per-pixel point sample (colour and sub-pixel information)
// bitExtract         lookup table indexed with (crossing value, 1)
// emptyOrder         lookup table indexed with (edge index, sub-pixel value)
// OUT                OUT[0] pixel class, OUT[1] point cull result
void PixelClassPointCull( in half2 pixelPos : WPOS,
const uniform samplerRECT edgeIntersections,
const uniform samplerRECT subPixelLocations,
const uniform samplerRECT bitExtract,
const uniform samplerRECT emptyOrder,
out half3 OUT[2] : COLOR0)
{
// Codes of the pixel sides, stored in edgeIntersection.w / .y below
const static float BOTTOM = 0;
const static float RIGHT = 1;
const static float TOP = 2;
const static float LEFT = 3;
// Save all intersections in a single half4 vector
// - Top: x
// - Bottom: y
// - Left: z
// - Right: w
half4 Intersections;
// Extract all 4 intersections around the pixel. Top and left belong to this pixel;
// bottom is the top crossing of the pixel at y - 1 and right is the left crossing of
// the pixel at x + 1.
Intersections.xz = texRECT(edgeIntersections, pixelPos).rg; // Top-Left
Intersections.y = texRECT(edgeIntersections, half2(pixelPos.x, pixelPos.y - 1)).r; // Bottom
Intersections.w = texRECT(edgeIntersections, half2(pixelPos.x + 1, pixelPos.y)).g; // Right
const half4 colorSample = texRECT(subPixelLocations, pixelPos);
// Scale by 255 to recover the stored byte values, for all intersections at once.
// NOTE(review): the original comment gives the range as [0, 8]; if this texture is the
// output of EdgeRastersFP the values would be 0 or a power of two up to 128 - confirm.
Intersections = round(Intersections * 255.0);
// intersection information: number of sides with a non-zero crossing
half intersectionCount = dot( step(0.001953125h.xxxx, Intersections), 1.0h.xxxx );
if (intersectionCount == 2) {
// construct a 5 bit mask whose information is given by the following
// the 2 MSB: indicates where left = 3, top = 2, right = 1, bottom = 0
//
// edgeIntersection is deliberately left uninitialized. Exactly two of the four
// branches below execute here; every branch after the top one first moves the
// previous (zw) pair into xy, so at the end xy holds the first crossing found and zw
// the second one, each as (table value, side code).
float4 edgeIntersection;
// Top intersection
if(Intersections.x > 0) {
edgeIntersection.z = texRECT(bitExtract, float2(Intersections.x, 1)).r;
edgeIntersection.w = TOP;
//intersectionCount++;
}
// Bottom intersection
if(Intersections.y > 0) {
edgeIntersection.xy = edgeIntersection.zw;
edgeIntersection.z = texRECT(bitExtract, float2(Intersections.y, 1)).r;
edgeIntersection.w = BOTTOM;
//intersectionCount++;
}
// Left intersection
if(Intersections.z > 0) {
edgeIntersection.xy = edgeIntersection.zw;
edgeIntersection.z = texRECT(bitExtract, float2(Intersections.z, 1)).r;
edgeIntersection.w = LEFT;
//intersectionCount++;
}
// Right intersection
if(Intersections.w > 0) {
edgeIntersection.xy = edgeIntersection.zw;
edgeIntersection.z = texRECT(bitExtract, float2(Intersections.w, 1)).r;
edgeIntersection.w = RIGHT;
//intersectionCount++;
}
// Pixel class under construction, in byte units; B starts at 255
half3 tColor = float3(0, 0, 255);
//half4 colorSample = texRECT(subPixelLocations, pixelPos);
// Blue and alpha of the point sample as byte values; when blue is 0 the second
// coordinate of the emptyOrder lookup is forced to 16
half2 subPixelMask = round(colorSample.ba * 255);
if(subPixelMask.x == 0)
subPixelMask.y = 16;
// Do edge ordering
//float emptyIndex = edgeIntersection.y * 256 + edgeIntersection.w * 64 +
// edgeIntersection.x * 8 + edgeIntersection.z;
// Same sum as above, written as a dot product
float emptyIndex = dot(edgeIntersection, half4(8,256,1,64));
float t = texRECT(emptyOrder, float2(emptyIndex, subPixelMask.y)).r;
// t >= 2 clears B; t == 1 or t == 3 swaps the order in which the two crossings are packed
if(t >= 2)
tColor.b = 0;
if(t == 1 || t == 3) {
tColor.rg = edgeIntersection.zw * float2(8,4) + edgeIntersection.xy;
}
else {
tColor.rg = edgeIntersection.xy * float2(8,4) + edgeIntersection.zw;
}
// Pixel class info
OUT[0] = tColor / 255.0;
}
else {
// Not exactly two crossings: (1, 15/255, 1) when there are more than two, (0, 0, 1) otherwise
OUT[0] = half3( (intersectionCount > 2 ? half2(1, 15/255.0) : half2(0,0) ),1);
}
// And now writes the point cull: the sample's R and G are kept, its B is multiplied by
// the B of the pixel class just written
OUT[1] = half3(colorSample.rg, colorSample.b * OUT[0].b);
}
Reachability
// Output of the reachability vertex shaders (NeighborReachVert, ReachabilityVert,
// CopyReachabilityVert) and input of the matching fragment programs
struct vertexInfo {
// Clip-space position of the vertex
float4 pos : POSITION;
// To hold all the interpolated texture coordinates: each element packs two 2D
// coordinates (.xy and .zw); each vertex shader fills only the entries it needs
half4 texCoords[8]; // To hold all the interpolated texture coordinates
};
/**
 * Vertex shader for the neighbor reach pass. It precomputes the texture coordinates of
 * the three neighbours used by the fragment program.
 *
 *   texCoords[0] = (-1, 0) ( 1, 0)      texCoords[1].xy = ( 0, 1)
 *
 * ModelViewProj  model-view-projection matrix (bound to state.matrix.mvp)
 * uv             texture coordinate of the vertex (rectangle texture, not normalized)
 * pos            vertex position
 * OUT            clip-space position plus the neighbour coordinates
 *
 * The original author reported # 8 instructions, 1 R-regs.
 */
void NeighborReachVert(
    uniform float4x4 ModelViewProj : state.matrix.mvp,
    in half2 uv : TEXCOORD0,
    in float4 pos : POSITION,
    out vertexInfo OUT)
{
    OUT.pos = mul(ModelViewProj, pos);

    // Offsets are whole texels because TEXTURE_RECTANGLE coordinates are not normalized.
    // Left and right neighbours share one element, the vertical one goes alone.
    OUT.texCoords[0]    = uv.xyxy + half4(-1, 0,  1, 0);
    OUT.texCoords[1].xy = uv      + half2( 0, 1);
}
/**
 * Vertex shader for the reachability pass. It precomputes the texture coordinates of a
 * 5x3 neighbourhood (x offsets -2..2, y offsets 0..2), two coordinates per element.
 *
 *   [0] (-2,0) (-1,0)   [1] ( 0,0) ( 1,0)   [2] ( 2,0) (-2,1)   [3] (-1,1) ( 0,1)
 *   [4] ( 1,1) ( 2,1)   [5] (-2,2) (-1,2)   [6] ( 0,2) ( 1,2)   [7] ( 2,2)  unused
 *
 * ModelViewProj  model-view-projection matrix (bound to state.matrix.mvp)
 * uv             texture coordinate of the vertex (rectangle texture, not normalized)
 * pos            vertex position
 * OUT            clip-space position plus the fifteen neighbourhood coordinates
 *
 * The original author reported # 19 instructions, 2 R-regs for the one-assignment-per-
 * coordinate formulation.
 */
void ReachabilityVert(
    uniform float4x4 ModelViewProj : state.matrix.mvp,
    in half2 uv : TEXCOORD0,
    in float4 pos : POSITION,
    out vertexInfo OUT)
{
    OUT.pos = mul(ModelViewProj, pos);

    // Offsets are whole texels because TEXTURE_RECTANGLE coordinates are not normalized
    const half4 uv2 = uv.xyxy;

    // Row 0
    OUT.texCoords[0] = uv2 + half4(-2, 0, -1, 0);
    OUT.texCoords[1] = half4(uv, uv + half2(1, 0));
    // Row 0 / row 1
    OUT.texCoords[2] = uv2 + half4( 2, 0, -2, 1);
    // Row 1
    OUT.texCoords[3] = uv2 + half4(-1, 1,  0, 1);
    OUT.texCoords[4] = uv2 + half4( 1, 1,  2, 1);
    // Row 2
    OUT.texCoords[5] = uv2 + half4(-2, 2, -1, 2);
    OUT.texCoords[6] = uv2 + half4( 0, 2,  1, 2);
    OUT.texCoords[7].xy = uv + half2(2, 2);
}
/**
 * Vertex shader for the reachability copy pass. It precomputes the texture coordinates
 * of the 5x2 block above/below the pixel (x offsets -2..2, y offsets -2 and -1), two
 * coordinates per element.
 *
 *   [0] (-2,-2) (-1,-2)   [1] ( 0,-2) ( 1,-2)   [2] ( 2,-2) (-2,-1)
 *   [3] (-1,-1) ( 0,-1)   [4] ( 1,-1) ( 2,-1)
 *
 * ModelViewProj  model-view-projection matrix (bound to state.matrix.mvp)
 * uv             texture coordinate of the vertex (rectangle texture, not normalized)
 * pos            vertex position
 * OUT            clip-space position plus the ten neighbourhood coordinates
 *
 * The original author reported # 17 instructions, 2 R-regs for the one-assignment-per-
 * coordinate formulation.
 */
void CopyReachabilityVert(
    uniform float4x4 ModelViewProj : state.matrix.mvp,
    in half2 uv : TEXCOORD0,
    in float4 pos : POSITION,
    out vertexInfo OUT)
{
    OUT.pos = mul(ModelViewProj, pos);

    // Offsets are whole texels because TEXTURE_RECTANGLE coordinates are not normalized
    const half4 uv2 = uv.xyxy;

    // Row -2
    OUT.texCoords[0] = uv2 + half4(-2, -2, -1, -2);
    OUT.texCoords[1] = uv2 + half4( 0, -2,  1, -2);
    // Row -2 / row -1
    OUT.texCoords[2] = uv2 + half4( 2, -2, -2, -1);
    // Row -1
    OUT.texCoords[3] = uv2 + half4(-1, -1,  0, -1);
    OUT.texCoords[4] = uv2 + half4( 1, -1,  2, -1);
}
/**
 * Neighbor reach fragment program. It combines the pixel class of a pixel with the
 * classes of three of its neighbours through lookup tables.
 *
 *   R - neighborTableRL [class at (-1,0), own class] / 255
 *   G - neighborTableLR [own class, class at ( 1,0)] / 255
 *   B - neighborTableVER[class at ( 0,1), own class] / 255
 *   A - 1 when the own class is at least 14.5, 0 otherwise
 *
 * The class of a pixel is the G channel of pixelClass scaled to a byte value.
 *
 * pos               window position, used as the rectangle-texture coordinate
 * IN                neighbour coordinates from NeighborReachVert
 * pixelClass        pixel classification texture
 * neighborTableLR,
 * neighborTableRL,
 * neighborTableVER  lookup tables indexed with a pair of classes
 *
 * The original author reported # 19 instructions, 2 R-regs, 2 H-regs
 * (28 instructions originally).
 */
half4 NeighborReach(in half2 pos : WPOS,
                    in vertexInfo IN,
                    const uniform samplerRECT pixelClass,
                    const uniform samplerRECT neighborTableLR,
                    const uniform samplerRECT neighborTableRL,
                    const uniform samplerRECT neighborTableVER) : COLOR {
    // Classes of: this pixel (x), (-1,0) (y), (1,0) (z) and (0,1) (w), as byte values
    const half4 cls = round(half4(texRECT(pixelClass, pos).g,
                                  texRECT(pixelClass, IN.texCoords[0].xy).g,
                                  texRECT(pixelClass, IN.texCoords[0].zw).g,
                                  texRECT(pixelClass, IN.texCoords[1].xy).g) * 255);

    // One table lookup per direction, plus the flag derived from the own class
    const half4 reach = half4(texRECT(neighborTableRL,  cls.yx).x,
                              texRECT(neighborTableLR,  cls.xz).x,
                              texRECT(neighborTableVER, cls.wx).x,
                              step(14.5, cls.x));

    // Only the table results are rescaled; the flag is already 0 or 1
    return reach / half4(255.0h.xxx, 1);
}
// New: # 140 instructions, 13 R-regs, 2 H-regs
// Original: # 165 instructions, 10 R-regs, 3 H-regs
//
// Reachability fragment program. Starting from the neighbour-reach values of the pixel
// (nr02) it propagates reachability over the 5x3 neighbourhood prepared by
// ReachabilityVert: values are chained from pixel to pixel with chainTable and alternative
// paths to the same pixel are merged with orTable. Each resulting value sets one bit of
// the output when it is at least 8.
//
// Output (divided by 255 at the end):
//   R - always 0
//   G - bits 4, 8, 16 from reachability013 and 32, 64, 128 from reachability456
//   B - bits 1, 2, 4, 8 from reachability789A and 16, 32, 64, 128 from reachabilityBCDE
//   A - alpha of the pixel's own neighbour-reach value (nr02.a)
//
// pos and pixelClass are not used by this program.
half4 Reachability(in half2 pos : WPOS,
in vertexInfo IN,
const uniform samplerRECT neighborTable,
const uniform samplerRECT pixelClass,
const uniform samplerRECT orTable,
const uniform samplerRECT chainTable) : COLOR
{
half4 color = half4(0, 0, 0, 0);
// The fourteen reachability values, split into vectors instead of an array
//half reachability[15];
half3 reachability013;
half3 reachability456;
half4 reachability789A;
half4 reachabilityBCDE;
// Scratch pair holding the two paths that are merged with orTable
half2 argument;
// Neighbour-reach values of the 5x3 neighbourhood as byte values; the trailing comment
// is the offset of each sample. Only the centre (nr02) keeps its alpha.
const half3 nr00 = round(texRECT(neighborTable, IN.texCoords[0].xy).rgb * 255); // -2, 0
const half3 nr01 = round(texRECT(neighborTable, IN.texCoords[0].zw).rgb * 255); // -1, 0
const half4 nr02 = round(texRECT(neighborTable, IN.texCoords[1].xy) * 255); // 0, 0
const half3 nr03 = round(texRECT(neighborTable, IN.texCoords[1].zw).rgb * 255); // 1, 0
const half3 nr04 = round(texRECT(neighborTable, IN.texCoords[2].xy).rgb * 255); // 2, 0
const half3 nr05 = round(texRECT(neighborTable, IN.texCoords[2].zw).rgb * 255); // -2, 1
const half3 nr06 = round(texRECT(neighborTable, IN.texCoords[3].xy).rgb * 255); // -1, 1
const half3 nr07 = round(texRECT(neighborTable, IN.texCoords[3].zw).rgb * 255); // 0, 1
const half3 nr08 = round(texRECT(neighborTable, IN.texCoords[4].xy).rgb * 255); // 1, 1
const half3 nr09 = round(texRECT(neighborTable, IN.texCoords[4].zw).rgb * 255); // 2, 1
const half3 nr10 = round(texRECT(neighborTable, IN.texCoords[5].xy).rgb * 255); // -2, 2
const half3 nr11 = round(texRECT(neighborTable, IN.texCoords[5].zw).rgb * 255); // -1, 2
const half3 nr12 = round(texRECT(neighborTable, IN.texCoords[6].xy).rgb * 255); // 0, 2
const half3 nr13 = round(texRECT(neighborTable, IN.texCoords[6].zw).rgb * 255); // 1, 2
const half3 nr14 = round(texRECT(neighborTable, IN.texCoords[7].xy).rgb * 255); // 2, 2
// ROW 0: direct values of the centre plus one chained step to each side
reachability013.y = nr02.r;
reachability013.x = texRECT(chainTable, half2(nr02.r, nr01.r)).x;
reachability013.z = nr02.g;
reachability456.x = texRECT(chainTable, half2(nr02.g, nr03.g)).x;
// To mask latency: reachability013 is complete, so its bits are emitted now
color.g += dot(step(8.0h.xxx, reachability013), half3(4,8,16));
// ROW 1: every value is the OR of two chained paths
reachability789A.x = nr02.b;
argument.x = texRECT(chainTable, half2(nr02.r, nr01.b)).x;
argument.y = texRECT(chainTable, half2(nr02.b, nr07.r)).x;
reachability456.z = texRECT(orTable, argument).x;
argument.x = texRECT(chainTable, half2(reachability013.x, nr00.b)).x;
argument.y = texRECT(chainTable, half2(reachability456.z, nr06.r)).x;
reachability456.y = texRECT(orTable, argument).x;
// To mask latency: reachability456 is complete at this point
color.g += dot(step(8.0h.xxx, reachability456), half3(32,64,128));
argument.x = texRECT(chainTable, half2(nr02.g, nr03.b)).x;
argument.y = texRECT(chainTable, half2(nr02.b, nr07.g)).x;
reachability789A.y = texRECT(orTable, argument).x;
argument.x = texRECT(chainTable, half2(reachability456.x, nr04.b)).x;
argument.y = texRECT(chainTable, half2(reachability789A.y, nr08.g)).x;
reachability789A.z = texRECT(orTable, argument).x;
// ROW 2: built on top of the row 1 results
reachabilityBCDE.y = texRECT(chainTable, float2(nr02.b, nr07.b)).x;
argument.x = texRECT(chainTable, half2(reachability456.z, nr06.b)).x;
argument.y = texRECT(chainTable, half2(reachabilityBCDE.y, nr12.r)).x;
reachabilityBCDE.x = texRECT(orTable, argument).x;
argument.x = texRECT(chainTable, half2(reachability456.y, nr05.b)).x;
argument.y = texRECT(chainTable, half2(reachabilityBCDE.x, nr11.r)).x;
reachability789A.w = texRECT(orTable, argument).x;
// To mask latency: reachability789A is complete at this point
color.b += dot(step(8.0h.xxxx, reachability789A), half4(1,2,4,8));
argument.x = texRECT(chainTable, half2(reachability789A.y, nr08.b)).x;
argument.y = texRECT(chainTable, half2(reachabilityBCDE.y, nr12.g)).x;
reachabilityBCDE.z = texRECT(orTable, argument).x;
argument.x = texRECT(chainTable, half2(reachability789A.z, nr09.b)).x;
argument.y = texRECT(chainTable, half2(reachabilityBCDE.z, nr13.g)).x;
reachabilityBCDE.w = texRECT(orTable, argument).x;
// To mask latency: reachabilityBCDE is complete at this point
color.b += dot(step(8.0h.xxxx, reachabilityBCDE), half4(16,32,64,128));
// The alpha of the centre sample is passed through
color.a = nr02.a;
// Back from byte values to [0, 1]
return color / 255.0;
}
// New: # 66 instructions, 2 R-regs, 4 H-regs
// # 70 instructions, 2 R-regs, 5 H-regs - with if branch
// Original: # 114 instructions, 3 R-regs, 4 H-regs
/**
 * Rebuilds the reachability word of the current pixel by pulling individual
 * bits out of the reachability texels addressed by IN.texCoords[0..4].
 *
 * All arithmetic is done on values rescaled to [0,255]; the result is
 * normalized again on return. When the alpha channel of the pixel's own
 * reachability texel is non-zero, every output bit is forced on.
 *
 * pos          - window position of the fragment.
 * IN           - interpolated neighbour coordinates (texCoords[0..4]).
 * pixelClass   - not read by this program.
 * reachability - reachability texture sampled for this pixel and its neighbours.
 */
half3 CopyReachability(in half2 pos : WPOS,
                       in vertexInfo IN,
                       const uniform samplerRECT pixelClass,
                       const uniform samplerRECT reachability) : COLOR
{
    // Start from this pixel's own reachability bytes.
    half4 result = round(texRECT(reachability, pos) * 255);

    // ---- First group of four neighbours: blue-channel bits 7, 6, 5, 4 ----
    half4 blueHi = half4(texRECT(reachability, IN.texCoords[0].xy).b,
                         texRECT(reachability, IN.texCoords[0].zw).b,
                         texRECT(reachability, IN.texCoords[1].xy).b,
                         texRECT(reachability, IN.texCoords[1].zw).b);
    blueHi = round(blueHi * 255);
    // Strip the bits above the one of interest (x needs no stripping).
    blueHi.yzw = fmod(blueHi.yzw, half3(128, 128, 32));
    blueHi.z   = fmod(blueHi.z, 64);
    // 1 where the wanted bit is set, packed into the low nibble of red.
    const half4 hiBits = step(half4(128, 64, 32, 16), blueHi);
    result.r += dot(hiBits, half4(1, 2, 4, 8));

    // ---- Second group of four neighbours: blue-channel bits 3, 2, 1, 0 ----
    half4 blueLo = half4(texRECT(reachability, IN.texCoords[2].xy).b,
                         texRECT(reachability, IN.texCoords[2].zw).b,
                         texRECT(reachability, IN.texCoords[3].xy).b,
                         texRECT(reachability, IN.texCoords[3].zw).b);
    blueLo = round(blueLo * 255);
    blueLo = fmod(blueLo, half4(16, 8, 4, 2));
    // 0.9 instead of 1 as the threshold for the lowest bit.
    const half4 loBits = step(half4(8, 4, 2, 0.9), blueLo);
    result.r += dot(loBits, half4(16, 32, 64, 128));

    // ---- Green channel: two more bits ----
    // Bit 7 of the green byte at texCoords[4].xy -> +1
    const half greenA = round(texRECT(reachability, IN.texCoords[4].xy).g * 255); // 1,-1
    result.g += step(128, greenA);

    // Bit 6 of the green byte at texCoords[0].zw -> +2
    half greenB = round(texRECT(reachability, IN.texCoords[0].zw).g * 255); // 2,-1
    greenB -= 128 * step(128, greenB);
    result.g += step(64, greenB) * 2;

    // A non-zero alpha overrides everything with all bits set.
    if (result.a > 0) {
        result.rgb = half3(255, 255, 255);
    }

    return result.rgb / 255.0;
}
Interpolation
// This version: # 214 instructions, 25 R-regs, 9 H-regs
// Original: # 325 instructions, 2 R-regs, 9 H-regs
// Time: 6.3 ms
/**
 * Reachability-constrained 5x5 colour interpolation.
 *
 * The 24 neighbours of the 5x5 window are visited in row-major order
 * (top-left to bottom-right, centre skipped) and split into three groups of
 * eight: rNeighbors, gNeighbors and bNeighbors. Bit k of reach.r / reach.g /
 * reach.b enables neighbour k of the matching group. A neighbour also has to
 * carry a sample, which is detected by its blue channel being >= 0.0001.
 * The centre pixel, when it carries a sample, contributes with weight 32.
 *
 * pos            - window position of the fragment.
 * reachability   - per-pixel reachability bits (three bytes in rgb).
 * colorImage     - sparse colour samples to interpolate.
 * prioritySeqnum - priority already assigned to pixels with a valid sample.
 * priorityTable  - lookup from accumulated weight to priority.
 *
 * Returns the weighted average colour in rgb and the priority in alpha.
 * Fragments whose accumulated weight is below 1 are discarded.
 */
half4 Interpolation5(in half2 pos : WPOS,
                     const uniform samplerRECT reachability,
                     const uniform samplerRECT colorImage,
                     const uniform samplerRECT prioritySeqnum,
                     const uniform samplerRECT priorityTable) : COLOR
{
    half modulator, centerModulator, weight = 0;
    half3 selfColor;
    half3 averageColor = float3(0, 0, 0);
    half3 reach = round(texRECT(reachability, pos).rgb * 255);

    half3 rNeighbors[8];
    half3 gNeighbors[8];
    half3 bNeighbors[8];
    half3 rWeights[8];
    half3 gWeights[8];
    half3 bWeights[8];

    // Filter weights: 8 for the 4-connected neighbours, 4 for the diagonal
    // ones and 1 for the outer ring of the 5x5 window.
    rWeights[0] = 1h;
    rWeights[1] = 1h;
    rWeights[2] = 1h;
    rWeights[3] = 1h;
    rWeights[4] = 1h;
    rWeights[5] = 1h;
    rWeights[6] = 4h;
    rWeights[7] = 8h;
    gWeights[0] = 4h;
    gWeights[1] = 1h;
    gWeights[2] = 1h;
    gWeights[3] = 8h;
    gWeights[4] = 8h;
    gWeights[5] = 1h;
    gWeights[6] = 1h;
    gWeights[7] = 4h;
    bWeights[0] = 8h;
    bWeights[1] = 4h;
    bWeights[2] = 1h;
    bWeights[3] = 1h;
    bWeights[4] = 1h;
    bWeights[5] = 1h;
    bWeights[6] = 1h;
    bWeights[7] = 1h;

    // Fetch the 24 neighbours in row-major order.
    rNeighbors[0] = texRECT(colorImage, pos.xy + half2(-2, -2)).rgb;
    rNeighbors[1] = texRECT(colorImage, pos.xy + half2(-1, -2)).rgb;
    rNeighbors[2] = texRECT(colorImage, pos.xy + half2( 0, -2)).rgb;
    rNeighbors[3] = texRECT(colorImage, pos.xy + half2( 1, -2)).rgb;
    rNeighbors[4] = texRECT(colorImage, pos.xy + half2( 2, -2)).rgb;
    rNeighbors[5] = texRECT(colorImage, pos.xy + half2(-2, -1)).rgb;
    rNeighbors[6] = texRECT(colorImage, pos.xy + half2(-1, -1)).rgb;
    rNeighbors[7] = texRECT(colorImage, pos.xy + half2( 0, -1)).rgb;
    gNeighbors[0] = texRECT(colorImage, pos.xy + half2( 1, -1)).rgb;
    gNeighbors[1] = texRECT(colorImage, pos.xy + half2( 2, -1)).rgb;
    // Leftmost sample of the centre row. This used to read ( 2, 0), which
    // duplicated gNeighbors[5] and never sampled the left outer texel.
    gNeighbors[2] = texRECT(colorImage, pos.xy + half2(-2,  0)).rgb;
    gNeighbors[3] = texRECT(colorImage, pos.xy + half2(-1,  0)).rgb;
    gNeighbors[4] = texRECT(colorImage, pos.xy + half2( 1,  0)).rgb;
    gNeighbors[5] = texRECT(colorImage, pos.xy + half2( 2,  0)).rgb;
    gNeighbors[6] = texRECT(colorImage, pos.xy + half2(-2,  1)).rgb;
    gNeighbors[7] = texRECT(colorImage, pos.xy + half2(-1,  1)).rgb;
    bNeighbors[0] = texRECT(colorImage, pos.xy + half2( 0,  1)).rgb;
    bNeighbors[1] = texRECT(colorImage, pos.xy + half2( 1,  1)).rgb;
    bNeighbors[2] = texRECT(colorImage, pos.xy + half2( 2,  1)).rgb;
    bNeighbors[3] = texRECT(colorImage, pos.xy + half2(-2,  2)).rgb;
    bNeighbors[4] = texRECT(colorImage, pos.xy + half2(-1,  2)).rgb;
    bNeighbors[5] = texRECT(colorImage, pos.xy + half2( 0,  2)).rgb;
    bNeighbors[6] = texRECT(colorImage, pos.xy + half2( 1,  2)).rgb;
    bNeighbors[7] = texRECT(colorImage, pos.xy + half2( 2,  2)).rgb;

    // Bit extraction: fmod by 2^(k+1) drops the bits above bit k, then
    // step against 2^k yields 1 when bit k is set and 0 otherwise.

    // Data for reach.r: 2x4 fmod, 2x4 step operations with those results
    const half4 reachRfmod1 = fmod(reach.rrrr, half4(2,4,8,16));
    const half4 reachRfmod2 = fmod(reach.rrrr, half4(32,64,128,256));
    const half4 reachRstep1 = step(half4(1,2,4,8), reachRfmod1);
    const half4 reachRstep2 = step(half4(16,32,64,128), reachRfmod2);
    // Data for reach.g: 2x4 fmod, 2x4 step operations with those results
    const half4 reachGfmod1 = fmod(reach.gggg, half4(2,4,8,16));
    const half4 reachGfmod2 = fmod(reach.gggg, half4(32,64,128,256));
    const half4 reachGstep1 = step(half4(1,2,4,8), reachGfmod1);
    const half4 reachGstep2 = step(half4(16,32,64,128), reachGfmod2);
    // Data for reach.b: 2x4 fmod, 2x4 step operations with those results
    const half4 reachBfmod1 = fmod(reach.bbbb, half4(2,4,8,16));
    const half4 reachBfmod2 = fmod(reach.bbbb, half4(32,64,128,256));
    const half4 reachBstep1 = step(half4(1,2,4,8), reachBfmod1);
    const half4 reachBstep2 = step(half4(16,32,64,128), reachBfmod2);

    // Sample-presence flags: 1 when the neighbour's blue channel is >= 0.0001.

    // Data for the rNeighbors.b
    const half4 rNeighborsStep1 = step(0.0001.xxxx,
        half4(rNeighbors[0].b, rNeighbors[1].b, rNeighbors[2].b, rNeighbors[3].b));
    const half4 rNeighborsStep2 = step(0.0001.xxxx,
        half4(rNeighbors[4].b, rNeighbors[5].b, rNeighbors[6].b, rNeighbors[7].b));
    // Data for the gNeighbors.b
    const half4 gNeighborsStep1 = step(0.0001.xxxx,
        half4(gNeighbors[0].b, gNeighbors[1].b, gNeighbors[2].b, gNeighbors[3].b));
    const half4 gNeighborsStep2 = step(0.0001.xxxx,
        half4(gNeighbors[4].b, gNeighbors[5].b, gNeighbors[6].b, gNeighbors[7].b));
    // Data for the bNeighbors.b
    const half4 bNeighborsStep1 = step(0.0001.xxxx,
        half4(bNeighbors[0].b, bNeighbors[1].b, bNeighbors[2].b, bNeighbors[3].b));
    const half4 bNeighborsStep2 = step(0.0001.xxxx,
        half4(bNeighbors[4].b, bNeighbors[5].b, bNeighbors[6].b, bNeighbors[7].b));

    // A neighbour contributes only if it is reachable AND holds a sample.

    // R - modulators
    const half4 rModulator1 = rNeighborsStep1 * reachRstep1;
    const half4 rModulator2 = rNeighborsStep2 * reachRstep2;
    // G - modulators
    const half4 gModulator1 = gNeighborsStep1 * reachGstep1;
    const half4 gModulator2 = gNeighborsStep2 * reachGstep2;
    // B - modulators
    const half4 bModulator1 = bNeighborsStep1 * reachBstep1;
    const half4 bModulator2 = bNeighborsStep2 * reachBstep2;

    // ****** ROW 0 ******
    modulator = rModulator1.x;
    averageColor += modulator * rNeighbors[0] * rWeights[0];
    weight += modulator * rWeights[0].x;
    modulator = rModulator1.y;
    averageColor += modulator * rNeighbors[1] * rWeights[1];
    weight += modulator * rWeights[1].x;
    modulator = rModulator1.z;
    averageColor += modulator * rNeighbors[2] * rWeights[2];
    weight += modulator * rWeights[2].x;
    modulator = rModulator1.w;
    averageColor += modulator * rNeighbors[3] * rWeights[3];
    weight += modulator * rWeights[3].x;
    modulator = rModulator2.x;
    averageColor += modulator * rNeighbors[4] * rWeights[4];
    weight += modulator * rWeights[4].x;

    // ****** ROW 1 ******
    modulator = rModulator2.y;
    averageColor += modulator * rNeighbors[5] * rWeights[5];
    weight += modulator * rWeights[5].x;
    modulator = rModulator2.z;
    averageColor += modulator * rNeighbors[6] * rWeights[6];
    weight += modulator * rWeights[6].x;
    modulator = rModulator2.w;
    averageColor += modulator * rNeighbors[7] * rWeights[7];
    weight += modulator * rWeights[7].x;
    modulator = gModulator1.x;
    averageColor += modulator * gNeighbors[0] * gWeights[0];
    weight += modulator * gWeights[0].x;
    modulator = gModulator1.y;
    averageColor += modulator * gNeighbors[1] * gWeights[1];
    weight += modulator * gWeights[1].x;

    // ****** ROW 2 ******
    modulator = gModulator1.z;
    averageColor += modulator * gNeighbors[2] * gWeights[2];
    weight += modulator * gWeights[2].x;
    modulator = gModulator1.w;
    averageColor += modulator * gNeighbors[3] * gWeights[3];
    weight += modulator * gWeights[3].x;
    // Centre pixel: no reachability test, only the sample-presence test.
    selfColor = texRECT(colorImage, pos).rgb;
    centerModulator = step(0.0001, selfColor.b);
    averageColor += centerModulator * selfColor * 32;
    weight += centerModulator * 32;
    modulator = gModulator2.x;
    averageColor += modulator * gNeighbors[4] * gWeights[4];
    weight += modulator * gWeights[4].x;
    modulator = gModulator2.y;
    averageColor += modulator * gNeighbors[5] * gWeights[5];
    weight += modulator * gWeights[5].x;

    // ****** ROW 3 ******
    modulator = gModulator2.z;
    averageColor += modulator * gNeighbors[6] * gWeights[6];
    weight += modulator * gWeights[6].x;
    modulator = gModulator2.w;
    averageColor += modulator * gNeighbors[7] * gWeights[7];
    weight += modulator * gWeights[7].x;
    modulator = bModulator1.x;
    averageColor += modulator * bNeighbors[0] * bWeights[0];
    weight += modulator * bWeights[0].x;
    modulator = bModulator1.y;
    averageColor += modulator * bNeighbors[1] * bWeights[1];
    weight += modulator * bWeights[1].x;
    modulator = bModulator1.z;
    averageColor += modulator * bNeighbors[2] * bWeights[2];
    weight += modulator * bWeights[2].x;

    // ****** ROW 4 ******
    modulator = bModulator1.w;
    averageColor += modulator * bNeighbors[3] * bWeights[3];
    weight += modulator * bWeights[3].x;
    modulator = bModulator2.x;
    averageColor += modulator * bNeighbors[4] * bWeights[4];
    weight += modulator * bWeights[4].x;
    modulator = bModulator2.y;
    averageColor += modulator * bNeighbors[5] * bWeights[5];
    weight += modulator * bWeights[5].x;
    modulator = bModulator2.z;
    averageColor += modulator * bNeighbors[6] * bWeights[6];
    weight += modulator * bWeights[6].x;
    modulator = bModulator2.w;
    averageColor += modulator * bNeighbors[7] * bWeights[7];
    weight += modulator * bWeights[7].x;

    // Discards pixels without samples in the 5x5 neighborhood
    if (weight < 1) discard;

    half4 outColor;
    outColor.rgb = averageColor / weight;
    // Saturates to 1 whenever the centre pixel holds a sample; otherwise it
    // is the normalized neighbour weight, which cannot exceed 64/255.
    outColor.a = saturate(weight / 255.0 + centerModulator);

    // Priority calculation
    const half pWeight = outColor.a;
    half priority;
    // If this is an invalid point, get its priority from the table,
    // else just get its previously established priority value
    if (pWeight > 64/255.0) { // The value was already normalized!
        priority = texRECT(prioritySeqnum, pos).r;
    }
    else {
        // Address the centre of texel number (pWeight * 255) in row 0.
        priority = texRECT(priorityTable, half2(pWeight * 255 + 0.5, 0.5)).r;
    }
    outColor.a = priority;
    return outColor;
}
Anti-aliasing
// This: # 23 instructions, 3 R-regs, 1 H-regs
// Orig: # 33 instructions, 3 R-regs, 1 H-regs
/**
 * Blends each pixel with one neighbour chosen by a lookup table.
 *
 * The red and green channels of pixelClass are combined into a single index
 * (red * 255 + green * 255 * 64, each rounded) that addresses row 0 of
 * neighborWeightTable. The table entry supplies a neighbour code in its red
 * channel and a blend weight in its green channel.
 *
 * pixelPos            - window position of the fragment.
 * pixelClass          - per-pixel classification providing the table index.
 * color               - image being filtered.
 * neighborWeightTable - lookup from index to (neighbour code, weight).
 *
 * Returns lerp(neighbour colour, own colour, weight) in rgb and the pixel's
 * own alpha unchanged.
 */
half4 AntiAliasing(
    in half2 pixelPos : WPOS,
    const uniform samplerRECT pixelClass,
    const uniform samplerRECT color,
    const uniform samplerRECT neighborWeightTable) : COLOR
{
    // Issued first so the fetch latency overlaps with the table lookup below.
    const half4 center = texRECT(color, pixelPos);

    // Build the table index from the two classification channels.
    const half2 classBits = round(texRECT(pixelClass, pixelPos).rg * half2(255, 255*64));
    const half2 tableCoord = half2(classBits.r + classBits.g, 0);
    const half2 entry = texRECT(neighborWeightTable, tableCoord).rg;

    // Decode the neighbour code: modf splits code/4 into a fractional part
    // (kept in x) and an integer part (written to y), which then map to
    // dx = -1 + 4 * frac and dy = 1 - integer.
    half2 offset;
    offset.x = modf(entry.r / 4.0, offset.y);
    offset = half2(-1, 1) + half2(4, -1) * offset;

    const half3 other = texRECT(color, pixelPos + offset).rgb;

    // entry.g == 1 keeps the pixel's own colour, 0 takes the neighbour's.
    return half4(lerp(other, center.rgb, entry.g), center.a);
}