关于PBO,我找了很多资料,google了很久,发现大部分PBO资料都和这篇类似:https://blog.csdn.net/panda1234lee/article/details/51546502 。在上传texture的过程中,我分别试了1、2、3个PBO来进行上传,然而效率并没有提高,反而降低了,有点无法理解。不过在读取数据的时候,使用两个PBO是可以提高效率的,所以总的来说还是有一定研究价值的。
我这里模拟了一下不断采集纹理的过程:每次用同一张图片去完整更新纹理,就好像每次都采集了一张新图片。然后使用FBO进行中间处理,读取数据后复制到另一个数组。总的来说就是模拟摄像头采集然后编码的流程。为了避免过多的逻辑,我尽量减少了额外的功能点。
首先生成两个PBO,并且分配空间。这里我还生成了三个上传纹理的PBO,但是没有使用,因为上传纹理没办法通过PBO提高效率,有知道原因的大佬可以告诉我一下。
// One-time PBO setup: creates three unpack (upload) buffers and two pack
// (download) buffers, and gives every one of them mPboSize bytes of storage.
if (init) {
    init = false;

    const int kUploadCount = 3;
    const int kDownloadCount = 2;

    uploadPobs = new GLuint[kUploadCount];
    downloadPbos = new GLuint[kDownloadCount];
    glGenBuffers(kUploadCount, uploadPobs);
    glGenBuffers(kDownloadCount, downloadPbos);

    // Buffer size = (4 bytes per pixel row, rounded up to a multiple of 128) * rows.
    const int rowAlign = 128;
    const int picW = curPicWidth;
    const int picH = curPicHeight;
    const int paddedRowBytes = (picW * 4 + (rowAlign - 1)) & ~(rowAlign - 1);
    mPboSize = paddedRowBytes * picH;

    // Allocate (without initial data) each upload buffer, then unbind.
    for (int i = 0; i < kUploadCount; ++i) {
        glBindBuffer(GL_PIXEL_UNPACK_BUFFER, uploadPobs[i]);
        glBufferData(GL_PIXEL_UNPACK_BUFFER, mPboSize, nullptr, GL_STREAM_DRAW);
    }
    glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0);

    // Allocate (without initial data) each download buffer, then unbind.
    for (int i = 0; i < kDownloadCount; ++i) {
        glBindBuffer(GL_PIXEL_PACK_BUFFER, downloadPbos[i]);
        glBufferData(GL_PIXEL_PACK_BUFFER, mPboSize, nullptr, GL_STATIC_READ);
    }
    glBindBuffer(GL_PIXEL_PACK_BUFFER, 0);
}
接下来就是正常的OpenGL纹理绘制:我们将坐标绑定到VAO,然后创建FBO,并且绘制到FBO的纹理上,然后读取数据。最后我们再切换回屏幕,再绘制到屏幕上,这次看到的图形就是上下颠倒的了。之前我也试过不使用FBO,直接每次更新单个纹理的整体数据来进行绘制,再使用PBO进行读取,但是无法提高效率。应该是更新纹理的时候,发现异步DMA没有完成就会等待,所以导致纹理更新时间变长。所以还是需要使用FBO。
/**
 * Draws one frame in two passes.
 *
 * Pass 1 binds the framebuffer object `frame`, draws `texture` into it as a
 * 4-vertex triangle strip, and calls readPixels() while `frame` is still bound.
 * Pass 2 binds framebuffer 0, switches the viewport to the backing size, and
 * draws `textureFrame` with the same vertex array.
 */
void PboRender::render() {
// Per-frame state setup. The clear below runs before `frame` is bound, so it
// applies to whichever framebuffer is bound when render() is entered.
glClearColor(0.0f, 0.0f, 0.0f, 0.0f);
glDisable(GL_DITHER);
glEnable(GL_CULL_FACE);
glEnable(GL_DEPTH_TEST);
glClear(GL_COLOR_BUFFER_BIT | GL_DEPTH_BUFFER_BIT);
// Pass 1 renders at the picture's own resolution.
glViewport(0, 0, curPicWidth, curPicHeight);
glUseProgram(program);
// resetTexture();
glBindFramebuffer(GL_FRAMEBUFFER, frame);
// Sample the source texture from texture unit 0.
glActiveTexture(GL_TEXTURE0);
glBindTexture(GL_TEXTURE_2D, texture);
glUniform1i(textureLocation, 0);
glBindVertexArray(VAO[0]);
glDrawArrays(GL_TRIANGLE_STRIP, 0, 4);
// Read back while `frame` is still the bound framebuffer.
readPixels();
// Pass 2: switch to framebuffer 0 and the backing-size viewport.
glBindFramebuffer(GL_FRAMEBUFFER, 0);
glViewport(0, 0, _backingWidth, _backingHeight);
// NOTE(review): no glClear follows this call inside render(), and the next
// render() resets the clear color before clearing, so this color looks
// unused — confirm intent.
glClearColor(0.2f, 0.3f, 0.3f, 1.0f);
// Presumably textureFrame is the color attachment of `frame` — confirm where the FBO is created.
glBindTexture(GL_TEXTURE_2D, textureFrame);
glBindVertexArray(VAO[0]);
glDrawArrays(GL_TRIANGLE_STRIP, 0, 4);
}
接着下面就是读取数据,同样模拟一下,功能就是读取数据之后复制到另一个数组。在这个过程中PBO真的是个大坑,同样的代码在不同的设备上性能差距太大了。下面提一下我发现的点。
// Gradle settings for the CMake-driven native build.
externalNativeBuild {
cmake {
// C++ compiler flags: C++11 standard, RTTI enabled, exceptions enabled.
cppFlags "-std=c++11 -frtti -fexceptions"
// Passes ANDROID_ARM_NEON=TRUE to CMake. Presumably required so the NEON
// instructions in the hand-written copy routine assemble — confirm.
arguments '-DANDROID_ARM_NEON=TRUE'
}
}
复制数据的方法
/**
 * Copies exactly `sz` bytes from `src` to `dst`.
 *
 * On 32-bit ARM builds with NEON enabled, whole 64-byte chunks are moved with
 * VLDM/VSTM through d0-d7, and any remaining 1..63 bytes are copied with
 * memcpy. On every other target the whole copy is a plain memcpy.
 *
 * @param dst Destination buffer; must have room for `sz` bytes.
 * @param src Source buffer; must contain at least `sz` readable bytes.
 * @param sz  Number of bytes to copy. Values <= 0 copy nothing.
 *
 * The regions must not overlap.
 */
void my_copy(volatile unsigned char *dst, volatile unsigned char *src, int sz)
{
    // Nothing to do for empty/negative sizes or missing buffers. The NEON loop
    // below is do-while shaped, so it must never be entered with zero bytes.
    if (sz <= 0 || dst == nullptr || src == nullptr) {
        return;
    }

#if defined(__arm__) && (defined(__ARM_NEON) || defined(__ARM_NEON__))
    // Only whole 64-byte chunks go through the NEON loop, so it never touches
    // memory beyond `sz` bytes.
    int bulk = sz & ~63;
    const int tail = sz & 63;

    if (bulk > 0) {
        // A numeric local label ("1:" / "1b") is used so the assembly can be
        // emitted more than once (e.g. after inlining) without a duplicate
        // symbol. The post-incrementing loads/stores leave dst and src
        // pointing just past the copied chunks.
        asm volatile (
        "1:                                 \n"
        "    VLDM %[src]!,{d0-d7}           \n"
        "    VSTM %[dst]!,{d0-d7}           \n"
        "    SUBS %[n],%[n],#0x40           \n"
        "    BGT 1b                         \n"
        : [dst]"+r"(dst), [src]"+r"(src), [n]"+r"(bulk)
        :
        : "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "cc", "memory");
    }

    if (tail > 0) {
        memcpy(const_cast<unsigned char *>(dst),
               const_cast<unsigned char *>(src),
               static_cast<size_t>(tail));
    }
#else
    // Portable fallback for targets without AArch32 NEON.
    memcpy(const_cast<unsigned char *>(dst),
           const_cast<unsigned char *>(src),
           static_cast<size_t>(sz));
#endif
}
读取数据方法
void PboRender::readPixels() {
int size = mPboSize;
char path[40];
sprintf(path, "/mnt/sdcard/pixel/readPixel%d.rgba", picCount);
// picCount--;
if (picCount <= 0) {
return;
}
if (!cachePixel) {
cachePixel = new byte[size];
memset(cachePixel, 0, size);
}
// if (access("/mnt/sdcard/pixel", 0)) {
// mkdir("/mnt/sdcard/pixel", S_IRUSR | S_IWUSR | S_IXUSR | S_IRWXG | S_IRWXO);
// }
// FILE *file = fopen(path, "wb");
glBindBuffer(GL_PIXEL_PACK_BUFFER, 0);
long long curTime = getCurrentTime();
if (downloadPboType == NONE) {
long long cc = getCurrentTime();
byte *pixel = new byte[size];
glReadPixels(0, 0, curPicWidth, curPicHeight, GL_RGBA, GL_UNSIGNED_BYTE, pixel);
LOGE("glReadPixels Time %lld", getCurrentTime() - cc);
cc = getCurrentTime();
// fwrite(pixel, size, 1, file);
memcpy(cachePixel, pixel, size);
LOGE("内存复制耗时 %lld", getCurrentTime() - cc);
delete[] pixel;
} else if (downloadPboType == ONE) {
} else if (downloadPboType == TWO) {
index = (index + 1) % 2;
nextIndex = (index + 1) % 2;
glBindBuffer(GL_PIXEL_PACK_BUFFER, downloadPbos[index]);
long long cc = getCurrentTime();
glReadPixels(0, 0, curPicWidth, curPicHeight, GL_RGBA, GL_UNSIGNED_BYTE, 0);
LOGE("glReadPixels Time %lld", getCurrentTime() - cc);
if (readPixInit) {
readPixInit = false;
glBindBuffer(GL_PIXEL_PACK_BUFFER, 0);
return;
}
glBindBuffer(GL_PIXEL_PACK_BUFFER, downloadPbos[nextIndex]);
GLubyte *ptr = static_cast(glMapBufferRange(GL_PIXEL_PACK_BUFFER, 0,
mPboSize,
GL_MAP_READ_BIT));
glUnmapBuffer(GL_PIXEL_PACK_BUFFER); // release pointer to mapping buffer
cc = getCurrentTime();
if (ptr) {
my_copy(cachePixel,ptr,size);
}
glBindBuffer(GL_PIXEL_PACK_BUFFER, 0);
// fwrite(ptr, size, 1, file);
LOGE("内存复制耗时 %lld", getCurrentTime() - cc);
}
// fclose(file);
LOGE("完成耗时%lld", getCurrentTime() - curTime);
}
好了,文章就到这里了,不是特别长,代码逻辑也比较简单,但是在性能测试上真的花了不少时间。PBO确实可以在一定条件下大幅提升性能,如果项目需要还是可以尝试一下。当然我们还可以在GPU内把数据转成YUV422,这样读取的时间还可以减少一半。总的来说性能提升应该比较大,只是PBO需要OpenGL ES 3.0,在手机上需要做版本适配。
源码