1.解析一个体积较大(50MB,约50万行)的数据文件时,遇到了读取、解析耗时较长的问题(debug下20+s,release下约3s)。
2.用UE或者everedit等编辑软件打开这种文件基本上只要0.5s不到的时间。
因此尝试优化代码,提高效率,这里记录下过程。
win10x64 + VS2015
数据文件分为三段:
第一段 230000 250000
第二段开始 1 115.853843 22.534526 -47.04851444
………….
第二段结束 230000 ………………………
第三段开始 1 3 2 96 1 0
………………………..
第三段结束 250000 ………………….
最基本的MFC读数据方式:CStdioFile打开文件-ReadString读取每行数据-查找分隔符-分割出字符串-转化为浮点数-保存到数组里去。
// Baseline reader: opens sFileName with CStdioFile, reads it line by line with
// ReadString, and splits every line on spaces using CString operations.
// Fills m_grdFile.mn (the two header counts) and m_grdFile.vVertex.
// NOTE(review): this listing is an excerpt - the third-section (mesh) loop and
// the function's closing brace are omitted in the original text.
// NOTE(review): the return values of Open() and ReadString() are not checked.
void CGRDFile::OpenStdioFile(CString sFileName)
{
CStdioFile m_file;
m_file.Open(sFileName, CFile::modeRead);
Vertex v;
MeshIndex mesh;
CString smn, sm, sn,sVertex, sN, sX, sY, sZ, sLeft, sMesh, s1Index, s2Index, s3Index;
// First section: one header line holding two integers, m and n.
m_file.ReadString(smn);
smn.Trim();
// m is the text before the first space; n is the remainder of the line.
sm = smn.Left(smn.Find(' '));
sn = smn.Mid(smn.Find(' '), smn.GetLength() - 1);
sn.Trim();
m_grdFile.mn.m = _wtoi(sm);
m_grdFile.mn.n = _wtoi(sn);
// Pre-allocate the arrays: m vertices and n mesh entries.
m_grdFile.vVertex.reserve(m_grdFile.mn.m);
m_grdFile.vMeshIndex.reserve(m_grdFile.mn.n);
// Second section: m lines, each parsed as "index x y z".
for (int i = 0; i < m_grdFile.mn.m; i++)
{
m_file.ReadString(sVertex);
sVertex.Trim();
// Peel one space-delimited field off the front, then trim the remainder;
// the same three-step pattern is repeated for each field.
sN = sVertex.Left(sVertex.Find(' '));
sLeft = sVertex.Right(sVertex.GetLength() - sN.GetLength());
sLeft.Trim();
sX = sLeft.Left(sLeft.Find(' '));
sLeft = sLeft.Right(sLeft.GetLength() - sX.GetLength());
sLeft.Trim();
sY = sLeft.Left(sLeft.Find(' '));
sLeft = sLeft.Right(sLeft.GetLength() - sY.GetLength());
sLeft.Trim();
// Whatever is left after the third field is converted as z.
sZ = sLeft;
v.nIndex = _wtoi(sN);
v.x = _wtof(sX);
v.y = _wtof(sY);
v.z = _wtof(sZ);
m_grdFile.vVertex.push_back(v);
// The third section is handled in a similar way and is not listed here.
}
经过测试,上面的代码在release下读取完数据耗时2779.03ms,在debug下耗时21122ms。
内存文件映射是一个提高大文件读取速度的方法。
void CGRDFile::OpenFile(CString sFileName)
{
HANDLE hFile = CreateFile(
sFileName,
FILE_SHARE_READ ,
0,
0,
OPEN_EXISTING,
FILE_ATTRIBUTE_NORMAL,
NULL);
if (hFile == INVALID_HANDLE_VALUE)
{
return ;
}
HANDLE hFileMap = CreateFileMapping(hFile, NULL, PAGE_READONLY, 0, 0, NULL);
if (hFileMap == NULL)
{
return;
}
// 得到系统分配粒度
SYSTEM_INFO SysInfo;
GetSystemInfo(&SysInfo);
DWORD dwGran = SysInfo.dwAllocationGranularity;
// 得到文件尺寸
DWORD dwFileSizeHigh;
__int64 qwFileSize = GetFileSize(hFile, &dwFileSizeHigh);
qwFileSize |= (((__int64)dwFileSizeHigh) << 32);
// 偏移地址
__int64 qwFileOffset = 0;
// 块大小
DWORD dwBlockBytes = 0;
for (int i = 1; ; ++i)
{
if (qwFileSize <= i * dwGran)
{
dwBlockBytes = i * dwGran;
break;
}
}
if (qwFileOffset >= 0)
{
// 映射视图
//如果在调用MapViewOfFile()的时候,dwNumberOfBytesToMap如果大于文件的size,那么该call就会失败,
//并且error code是access denied。所以MapViewOfFile的最后一个参数不能用dwBlockSize而只能用qwFileSize。
//MapViewOfFile的返回值不能是TCHAR*,因为UNICODE时解释成宽字符,会出错,只能用char*
char *lpbMapAddress = (char *)MapViewOfFile(hFileMap, FILE_MAP_READ, 0, 0, qwFileSize);
if (lpbMapAddress == NULL)
{
return;
}
CString stemp(lpbMapAddress);
CString smn,sm, sn;
smn = stemp.Left(stemp.Find('\n'));
smn.Trim();
sm = smn.Left(smn.Find(' '));
sn = smn.Mid(smn.Find(' '), smn.GetLength() - 1);
sn.Trim();
m_grdFile.mn.m = _wtoi(sm);
m_grdFile.mn.n = _wtoi(sn);
//预分配数组大小
m_grdFile.vVertex.reserve(m_grdFile.mn.m);
m_grdFile.vMeshIndex.reserve(m_grdFile.mn.n);
//顶点数组
Vertex v;
CString sVertex, sN, sX, sY, sZ, sLeft;
int nStart, nEnd;
nStart = stemp.Find('\n');
nEnd = stemp.Find('\n', nStart + 1);
for (int i = 0; i < m_grdFile.mn.m; i++)
{
sVertex = stemp.Mid(nStart+ 1, nEnd - nStart + 1);
sVertex.Trim();
sN = sVertex.Left(sVertex.Find(' '));
sLeft = sVertex.Right(sVertex.GetLength() - sN.GetLength());
sLeft.Trim();
sX = sLeft.Left(sLeft.Find(' '));
sLeft = sLeft.Right(sLeft.GetLength() - sX.GetLength());
sLeft.Trim();
sY = sLeft.Left(sLeft.Find(' '));
sLeft = sLeft.Right(sLeft.GetLength() - sY.GetLength());
sLeft.Trim();
sZ = sLeft;
v.nIndex = _wtoi(sN);
v.x = _wtof(sX);
v.y = _wtof(sY);
v.z = _wtof(sZ);
m_grdFile.vVertex.push_back(v);
nStart = stemp.Find('\n', nEnd);
nEnd = stemp.Find('\n', nStart + 1);
}
//网格数组
CString sMesh, s1Index, s2Index, s3Index;
MeshIndex mesh;
for (int i = 0; i < m_grdFile.mn.n; i++)
{
sMesh = stemp.Mid(nStart + 1, nEnd - nStart + 1);
sMesh.Trim();
sN = sMesh.Left(sMesh.Find(' '));
sLeft = sMesh.Right(sMesh.GetLength() - sN.GetLength());
sLeft.Trim();
s1Index = sLeft.Left(sLeft.Find(' '));
sLeft = sLeft.Right(sLeft.GetLength() - s1Index.GetLength());
sLeft.Trim();
s1Index = sLeft.Left(sLeft.Find(' '));
sLeft = sLeft.Right(sLeft.GetLength() - s1Index.GetLength());
sLeft.Trim();
s2Index = sLeft.Left(sLeft.Find(' '));
sLeft = sLeft.Right(sLeft.GetLength() - s2Index.GetLength());
sLeft.Trim();
s3Index = sLeft.Left(sLeft.Find(' '));
mesh.nIndex = _wtoi(sN);
mesh.nV1Index = _wtoi(s1Index);
mesh.nV2Index = _wtoi(s2Index);
mesh.nV3Index = _wtoi(s3Index);
m_grdFile.vMeshIndex.push_back(mesh);
nStart = stemp.Find('\n', nEnd);
nEnd = stemp.Find('\n', nStart + 1);
}
// 撤销文件映像
UnmapViewOfFile(lpbMapAddress);
// 关闭文件对象
CloseHandle(hFile);
}
}
经过测试,上面的代码在release下读取完数据耗时1437.46ms,在debug下耗时13738ms。
仔细观察了数据文件,每行占的字节数、每个值占的字节数其实都是确定的,利用这个特点,可以避免循环里非常复杂耗时的字符串解析过程。
//内存文件映射部分代码不贴了,和上面一样
Vertex v;
MeshIndex mesh;
CString smn, sm, sn, sVertex, sN, sX, sY, sZ, sLeft, sMesh, s1Index, s2Index, s3Index,sblank;
smn = stemp.Left(MN_LINE_SIZE);
smn.Trim();
sm = smn.Mid(0, MN_M_CHAR_SIZE);
sn = smn.Mid(MN_M_CHAR_SIZE, MN_N_CHAR_SIZE);
m_grdFile.mn.m = _wtoi(sm);
m_grdFile.mn.n = _wtoi(sn);
//预分配数组大小
m_grdFile.vVertex.reserve(m_grdFile.mn.m);
m_grdFile.vMeshIndex.reserve(m_grdFile.mn.n);
//顶点数组
for (int i = 0; i < m_grdFile.mn.m; i++)
{
sVertex = stemp.Mid(MN_LINE_SIZE + i * VERTEX_LINE_SIZE, VERTEX_LINE_SIZE);
sN = sVertex.Mid(0, VERTEX_N_CHAR_SIZE);
sX = sVertex.Mid(VERTEX_N_CHAR_SIZE , VERTEX_X_CHAR_SIZE);
sY = sVertex.Mid(VERTEX_N_CHAR_SIZE + VERTEX_X_CHAR_SIZE , VERTEX_Y_CHAR_SIZE);
sZ = sVertex.Mid(VERTEX_N_CHAR_SIZE + VERTEX_X_CHAR_SIZE + VERTEX_Y_CHAR_SIZE, VERTEX_Z_CHAR_SIZE);
v.nIndex = _wtoi(sN);
v.x = _wtof(sX);
v.y = _wtof(sY);
v.z = _wtof(sZ);
m_grdFile.vVertex.push_back(v);
}
//网格数组
for (int i = 0; i < m_grdFile.mn.n; i++)
{
sMesh = stemp.Mid(MN_LINE_SIZE + m_grdFile.mn.m * VERTEX_LINE_SIZE + MESH_LINE_SIZE * i ,
MESH_LINE_SIZE);
sN = sMesh.Mid(0, MESH_N_CHAR_SIZE);
s1Index = sMesh.Mid(MESH_N_CHAR_SIZE + MESH_NODEFINE_CHAR_SIZE , MESH_V1_CHAR_SIZE);
s2Index = sMesh.Mid(MESH_N_CHAR_SIZE + MESH_NODEFINE_CHAR_SIZE + MESH_V1_CHAR_SIZE , MESH_V2_CHAR_SIZE);
s3Index = sMesh.Mid(MESH_N_CHAR_SIZE + MESH_NODEFINE_CHAR_SIZE + MESH_V1_CHAR_SIZE + MESH_V2_CHAR_SIZE ,
MESH_V3_CHAR_SIZE);
mesh.nIndex = _wtoi(sN);
mesh.nV1Index = _wtoi(s1Index);
mesh.nV2Index = _wtoi(s2Index);
mesh.nV3Index = _wtoi(s3Index);
m_grdFile.vMeshIndex.push_back(mesh);
}
经过测试,上面的代码在release下读取完数据耗时642ms,在debug下耗时4763ms。