6. Capturing camera video and audio with DirectShow, encoding to H.264 and AAC in real time, and muxing into MP4

The previous post covered capturing video and encoding it to H.264 in real time, but did not cover real-time audio encoding. This post walks through a more complete example: capture video and audio with DirectShow, encode both in real time, and mux the result into an MP4 file. Video is still encoded with the x264 encoder used before; audio is encoded with FAAC, a commonly used AAC encoder whose source can be downloaded from its official site. The encoded H.264 and AAC streams are then muxed into MP4.

This post builds on the previous one (5. Capturing from a camera with DirectShow and H.264 real-time encoding); we only need to add audio capture, audio encoding, and the final muxing step. One thing does change on the video side: previously the encoding was done directly inside the sample grabber callback. This post uses a better approach: the callback only appends each incoming frame to an array, and a separate worker thread encodes while capture continues. This is more reasonable, because spending a lot of time encoding inside the callback can cause frames to be dropped. The callback code is as follows:

HRESULT STDMETHODCALLTYPE CSampleGrabberCB::BufferCB(double SampleTime, BYTE *pBuffer, long BufferLen)
{
	CString str;
	//str.Format(_T("\n BufferCB--lBufferSize:%ld,lWidth:%d,lHeight:%d"), BufferLen, lWidth, lHeight);
	//OutputDebugString(str);

	// Encoding (i.e. recording) has been started
	if (m_bBeginEncode)
	{
		BYTE * pRgbData = new BYTE[BufferLen];
		memcpy(pRgbData, pBuffer, BufferLen);
		GrabDataInfo sData;
		sData.pData = pRgbData;
		sData.nDataSize = BufferLen;
		sData.dSampleTime = SampleTime;

		// Store the frame in the array first; the worker thread will consume it
		m_mxMsgLog.Lock();
		m_arrGrabDataArr.Add(sData);
		m_mxMsgLog.Unlock();

		str.Format(_T("\n Video--BufferLen:%ld, SampleTime:%f \n"), BufferLen, SampleTime);
		OutputDebugString(str);

		if (m_bFirst)
		{
			m_bFirst = FALSE;

			CString str;
			str.Format(_T("\n Video--SampleTime:%f \n"), SampleTime);
			OutputDebugString(str);
			// Start the worker thread that does the actual encoding
			AfxBeginThread(VideoDealFunc, this);
		}
	}

	return 0;
}

m_arrGrabDataArr is a CArray of GrabDataInfo structures; the struct is defined as follows:

struct GrabDataInfo
{
	BYTE *pData;
	int nDataSize;
	double dSampleTime;

	GrabDataInfo()
	{
		pData = NULL;
		nDataSize = 0;
		dSampleTime = 0.0;
	};
	
	GrabDataInfo(const GrabDataInfo &other)
	{
		*this = other;
	};
	
	GrabDataInfo& operator = (const GrabDataInfo& other)
	{
		pData = other.pData;
		nDataSize = other.nDataSize;
		dSampleTime = other.dSampleTime;
		return *this;
	};
};
typedef CArray<GrabDataInfo, GrabDataInfo&> ASGrabDataInfoArray;

The processing itself is now done entirely in the worker thread:

UINT VideoDealFunc(LPVOID lpVoid)
{
	CSampleGrabberCB *pManage = (CSampleGrabberCB*)lpVoid;
	if (pManage)
	{
		pManage->VideoDeal();
	}
	return 0;
}
void CSampleGrabberCB::VideoDeal()
{
	// Wait until the audio side has produced its first successfully encoded sample and recorded its time
	while (!theApp.m_IsBegin)
	{
		Sleep(200);
	}

	double dSampleTime = theApp.m_nSampleTime;

	m_nFrameIndex = 0;
	int csp = X264_CSP_I420;
	int width = lWidth;
	int height = lHeight;
	int y_size = width * height;
	ULONG nYUVLen = lWidth * lHeight + (lWidth * lHeight)/2;

	USES_CONVERSION;
	string strFullPath = W2A(m_sSavePath);
	m_fp_dst = fopen(strFullPath.c_str(), "wb");

	m_pParam = (x264_param_t*)malloc(sizeof(x264_param_t));

	// Fill in default parameters; this corrects invalid values and initializes the structures needed for CABAC, prediction, etc.
	x264_param_default(m_pParam);

	// Use the zerolatency tuning so frames come out immediately instead of being delayed inside the encoder
	x264_param_default_preset(m_pParam, "fast", "zerolatency");

	m_pParam->i_width = width;
	m_pParam->i_height = height;
	m_pParam->i_csp = X264_CSP_I420;          
	m_pParam->i_fps_num = 30;       // frame-rate numerator
	m_pParam->i_fps_den = 1;        // frame-rate denominator
	//m_pParam->b_repeat_headers = 1;  // repeat SPS/PPS before every keyframe

	// Apply a profile (the conformance level of the output bitstream); higher profiles enable more coding tools at a higher resource cost
	x264_param_apply_profile(m_pParam, x264_profile_names[1]);

	// x264_picture_t holds the raw picture data before encoding
	m_pPic_in = (x264_picture_t*)malloc(sizeof(x264_picture_t));
	m_pPic_out = (x264_picture_t*)malloc(sizeof(x264_picture_t));

	x264_picture_init(m_pPic_out);

	// Allocate the picture planes for the input x264_picture_t
	x264_picture_alloc(m_pPic_in, csp, m_pParam->i_width, m_pParam->i_height);

	// Open the encoder
	m_pHandle = x264_encoder_open(m_pParam);
	if (m_pHandle == NULL) // clean up and bail out on failure
	{
		x264_picture_clean(m_pPic_in); // release the planes allocated by x264_picture_alloc
		free(m_pPic_in);
		m_pPic_in = NULL;
		free(m_pPic_out);
		m_pPic_out = NULL;

		free(m_pParam);
		m_pParam = NULL;

		fclose(m_fp_dst); // the output file was already opened above
		m_fp_dst = NULL;
		return;
	}

	while (1)
	{
		DWORD dwRet = WaitForSingleObject(m_hMainExitEvent, 5);  
		if(dwRet == WAIT_OBJECT_0)
		{
			if (m_arrGrabDataArr.GetSize() <= 0)
			{
				break;
			}
		}

		m_mxMsgLog.Lock();
		int nCount = m_arrGrabDataArr.GetSize();
		if(nCount<=0)
		{
			m_mxMsgLog.Unlock();
			continue;
		}

		GrabDataInfo sDataInfo = m_arrGrabDataArr.GetAt(0);
		m_arrGrabDataArr.RemoveAt(0);
		m_mxMsgLog.Unlock();

		if (sDataInfo.dSampleTime < dSampleTime)
		{
			delete[] sDataInfo.pData;
			continue;
		}

		// Encode the frame
		// One frame's worth of YUV data
		BYTE * yuvByte = new BYTE[nYUVLen];
		// Convert RGB24 to YUV420 first
		RGB2YUV(sDataInfo.pData, lWidth, lHeight, yuvByte, &nYUVLen);

		delete[] sDataInfo.pData;

		if (m_pPic_in == NULL || m_pPic_out == NULL || m_pHandle == NULL || m_pParam == NULL)
		{
			delete[] yuvByte; // avoid leaking the frame buffer
			continue;
		}

		int iNal = 0;

		// Receives the encoded bitstream (NAL units)
		x264_nal_t* pNals = NULL;

		// Note the offsets and sizes: the first y_size bytes are Y, the next y_size/4 are U, the last y_size/4 are V
		memcpy(m_pPic_in->img.plane[0], yuvByte, y_size);						// Y plane
		memcpy(m_pPic_in->img.plane[1], yuvByte + y_size, y_size/4);			// U plane
		memcpy(m_pPic_in->img.plane[2], yuvByte + y_size + y_size/4, y_size/4); // V plane

		m_pPic_in->i_pts = m_nFrameIndex++; // presentation timestamp

		// Encode one frame; pNals receives the output NAL units and iNal their count
		int ret = x264_encoder_encode(m_pHandle, &pNals, &iNal, m_pPic_in, m_pPic_out);
		if (ret < 0)
		{
			OutputDebugString(_T("\n x264_encoder_encode err"));
			delete[] yuvByte;
			continue;
		}

		// Write the NAL units to the output file
		for (int j = 0; j < iNal; ++j)
		{
			fwrite(pNals[j].p_payload, 1, pNals[j].i_payload, m_fp_dst);
		}

		delete[] yuvByte; // release the temporary YUV buffer
	}

	int iNal = 0;
	// Receives the encoded bitstream (NAL units)
	x264_nal_t* pNals = NULL;

	// Flush the frames still buffered inside the encoder
	while (1)
	{
		int ret = x264_encoder_encode(m_pHandle, &pNals, &iNal, NULL, m_pPic_out);
		if (ret <= 0) // 0 means the encoder is drained; also stop on error
		{
			break;
		}
		for (int j = 0; j < iNal; ++j)
		{
			fwrite(pNals[j].p_payload, 1, pNals[j].i_payload, m_fp_dst);
		}
	}

	// Free the picture memory
	x264_picture_clean(m_pPic_in);

	// Close the encoder
	x264_encoder_close(m_pHandle);
	m_pHandle = NULL;

	free(m_pPic_in);
	m_pPic_in = NULL;
	free(m_pPic_out);
	m_pPic_out = NULL;

	free(m_pParam);
	m_pParam = NULL;

	// Close the output file
	fclose(m_fp_dst);
	m_fp_dst = NULL;
		
	m_nFrameIndex = 0;
	m_bEndEncode = TRUE;
}

Pay attention to the encoder parameters: the previous post did not set a frame rate, which is added here. The frame rate should match the capture device's actual rate; otherwise the video may drift out of sync with the audio and play back too fast or too slow.
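
If you do not want to hard-code the frame rate, one option is to read it from the capture pin before filling in the x264 parameters. Below is a minimal sketch (not part of the original project) that reuses the m_pCapture graph builder and m_pVideoFilter capture filter already used in this series; it assumes the pin reports a VIDEOINFOHEADER format, where AvgTimePerFrame is the frame duration in 100-nanosecond units. The local names pVideoConfig, pmt, pVih and nFps are illustrative, and DeleteMediaType is the helper from the DirectShow base classes.

// Sketch: query the capture pin's current format and derive the frame rate for x264
IAMStreamConfig *pVideoConfig = NULL;
HRESULT hr = m_pCapture->FindInterface(&PIN_CATEGORY_CAPTURE, &MEDIATYPE_Video,
                                       m_pVideoFilter, IID_IAMStreamConfig, (void **)&pVideoConfig);
if (SUCCEEDED(hr) && pVideoConfig)
{
	AM_MEDIA_TYPE *pmt = NULL;
	if (SUCCEEDED(pVideoConfig->GetFormat(&pmt)) && pmt && pmt->formattype == FORMAT_VideoInfo)
	{
		VIDEOINFOHEADER *pVih = (VIDEOINFOHEADER *)pmt->pbFormat;
		if (pVih->AvgTimePerFrame > 0)
		{
			// AvgTimePerFrame is in 100 ns units, so 10,000,000 / duration = frames per second
			int nFps = (int)(10000000 / pVih->AvgTimePerFrame);
			m_pParam->i_fps_num = nFps;
			m_pParam->i_fps_den = 1;
		}
		DeleteMediaType(pmt); // helper from the DirectShow base classes
	}
	pVideoConfig->Release();
}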

Also, on some devices the frame rate does not appear to be fixed: it changes with the exposure value, and the recorded video can play back as if fast-forwarded. The fix is simple: set the capture device's exposure to a fixed manual value. The code is as follows:

//Disable auto exposure and set a fixed manual value
IAMCameraControl *m_pCtrl;
m_pVideoFilter->QueryInterface(IID_IAMCameraControl, (void **)&m_pCtrl);
m_pCtrl->Set(CameraControl_Exposure, -5, CameraControl_Flags_Manual);
m_pCtrl->Release();

The above covers video. Audio is captured the same way as video, through a callback that is set up before capture starts:

//Set up the audio sample grabber
		m_pAudioGrabberFilter->QueryInterface(IID_ISampleGrabber, (void **)&m_pAudioGrabber);

		// Query the audio capture source's format parameters
		IAMStreamConfig *pAudioConfig = NULL;  
		m_pCapture->FindInterface(&PIN_CATEGORY_CAPTURE, &MEDIATYPE_Audio, 
							m_pAudioFilter, IID_IAMStreamConfig, (void **) &pAudioConfig);

		AM_MEDIA_TYPE *audiPmt = NULL; 
		AUDIO_STREAM_CONFIG_CAPS ascc;
		pAudioConfig->GetStreamCaps(0, &audiPmt, (BYTE*)&ascc);

		WAVEFORMATEX *pVih = (WAVEFORMATEX*)audiPmt->pbFormat;

		m_audioCB.m_nChannels = pVih->nChannels;
		m_audioCB.m_nSamplesPerSec = pVih->nSamplesPerSec;
		m_audioCB.m_wBitsPerSample = pVih->wBitsPerSample;
		
		//audiPmt->cbFormat = sizeof(WAVEFORMATEX);
		//audiPmt->pbFormat = (BYTE*)pVih;
		audiPmt->majortype = MEDIATYPE_Audio;   // MEDIATYPE_Video in the video path
		audiPmt->subtype = MEDIASUBTYPE_PCM;    // MEDIASUBTYPE_RGB24 in the video path
		audiPmt->formattype = FORMAT_WaveFormatEx; // this field was not set in the video path

		pAudioConfig->SetFormat(audiPmt);
		hr = m_pAudioGrabber->SetMediaType(audiPmt);
		if(FAILED(hr))
		{
			AfxMessageBox(_T("Fail to set audio media type!"));
			return;
		}

		m_pAudioGrabber->SetBufferSamples(FALSE); 
		m_pAudioGrabber->SetOneShot(FALSE);
		m_pAudioGrabber->SetCallback(&m_audioCB, 1);

Likewise, the audio callback starts a thread that handles the audio encoding. The callback code is as follows:

HRESULT STDMETHODCALLTYPE CAudioSampleGrabber::BufferCB(double SampleTime, BYTE *pBuffer, long BufferLen)
{
	CString str;
	//str.Format(_T("\n Audio--BufferLen:%ld"), BufferLen);
	//OutputDebugString(str);

	if(m_bBeginEncode)
	{
		BYTE * pcmData = new BYTE[BufferLen];
		memcpy(pcmData, pBuffer, BufferLen);
		GrabDataInfo sData;
		sData.pData = pcmData;
		sData.nDataSize = BufferLen;
		sData.dSampleTime = SampleTime;
		// Lock while appending, since the worker thread reads this array under the same lock
		m_mxMsgLog.Lock();
		m_arrAudioDataInfo.Add(sData);
		m_mxMsgLog.Unlock();

		str.Format(_T("\n Audio--BufferLen:%ld, SampleTime:%f \n"), BufferLen, SampleTime);
		OutputDebugString(str);

		if (m_bFirst)
		{
			m_bFirst = FALSE;

			AfxBeginThread(AudioDealFunc, this);
		}
	}

	return 0;
}

Audio is encoded with FAAC. Note that once the channel count and sample rate are configured, faac consumes a fixed amount of data per call, and the buffer delivered by each callback is generally larger than that fixed size. So the data has to be split: do not feed a whole callback buffer to the encoder in one call, but encode it in several passes. The encoding code is as follows:

void CAudioSampleGrabber::AudioDeal()
{
	USES_CONVERSION;
	string strFullPath = W2A(m_sSavePath);
	m_fpOut = fopen(strFullPath.c_str(), "wb");

	m_hFaacEncHandle = faacEncOpen(m_nSamplesPerSec, m_nChannels, &m_nInputSamples, &m_nMaxOutputBytes);
	if(m_hFaacEncHandle == NULL)
	{
		OutputDebugString(_T("faacEncOpen failed"));
		return;
	}

	// Get the current encoder configuration
	m_faacConfigurePtr = faacEncGetCurrentConfiguration(m_hFaacEncHandle);
	m_faacConfigurePtr->inputFormat = FAAC_INPUT_16BIT;
	// 0 = raw AAC, 1 = ADTS
	m_faacConfigurePtr->outputFormat = 1;
	m_faacConfigurePtr->aacObjectType = MAIN;
	m_faacConfigurePtr->allowMidside = 0;
	m_faacConfigurePtr->useLfe = 0;
	m_faacConfigurePtr->useTns = 1;

	// Apply the configuration
	int nRet = faacEncSetConfiguration(m_hFaacEncHandle, m_faacConfigurePtr);

	m_bSampleBegin = TRUE;

	while (1)
	{
		DWORD dwRet = WaitForSingleObject(m_hMainExitEvent, 10);  
		if(dwRet == WAIT_OBJECT_0)
		{
			if (m_arrAudioDataInfo.GetSize() <= 0)
			{
				break;
			}
		}
		m_mxMsgLog.Lock();
		int nCount = m_arrAudioDataInfo.GetSize();
		if(nCount<=0)
		{
			m_mxMsgLog.Unlock();
			continue;
		}

		GrabDataInfo sDataInfo = m_arrAudioDataInfo.GetAt(0);
		m_arrAudioDataInfo.RemoveAt(0);
		m_mxMsgLog.Unlock();

		if (m_hFaacEncHandle == NULL || m_faacConfigurePtr == NULL)
		{
			continue;
		}

		// Number of PCM bytes the encoder consumes per call
		int nPCMBufferSize = m_nInputSamples*m_wBitsPerSample / 8;

		BYTE* pbPCMBuffer = new BYTE[nPCMBufferSize];
		BYTE *pbAACBuffer = new BYTE [m_nMaxOutputBytes];

		ULONG ulTotalEncode = 0;
		int nTime = 0;
		while (1)
		{
			// pBuffer holds BufferLen bytes, much more than the encoder's per-call capacity nPCMBufferSize,
			// so take nPCMBufferSize bytes from it at a time until the whole buffer has been consumed
			memcpy(pbPCMBuffer, sDataInfo.pData+ulTotalEncode, nPCMBufferSize);
			ulTotalEncode += nPCMBufferSize;
			nTime++;
			int nRet = faacEncEncode(m_hFaacEncHandle, (int*) pbPCMBuffer, m_nInputSamples, pbAACBuffer, m_nMaxOutputBytes);
			if (nRet <= 0) // faac buffers the first few frames internally, so early calls produce no output; that data is effectively dropped
			{
				break;
			}

			// Record the sample time of the first chunk that produced real AAC output
			if (m_bSampleBegin)
			{
				m_bSampleBegin = FALSE;
				theApp.m_nSampleTime = sDataInfo.dSampleTime; // used by the video thread for synchronization
				theApp.m_IsBegin = TRUE;
			}

			// Write the AAC frame to the file
			fwrite(pbAACBuffer, 1, nRet, m_fpOut);

			// On the last chunk the remaining size is nDataSize - ulTotalEncode, not a full nPCMBufferSize
			if (sDataInfo.nDataSize < ulTotalEncode + nPCMBufferSize)
			{
				int nEndDataSize = sDataInfo.nDataSize - ulTotalEncode;
				if (nEndDataSize > 0) // encode the leftover samples
				{
					delete[] pbPCMBuffer;
					pbPCMBuffer = new BYTE[nEndDataSize];
					memcpy(pbPCMBuffer, sDataInfo.pData+ulTotalEncode, nEndDataSize);

					// Adjust the input sample count for this partial chunk
					int nInputSamples = nEndDataSize / (m_wBitsPerSample/8);
					// Encode the remaining data
					nRet = faacEncEncode(m_hFaacEncHandle, (int*)pbPCMBuffer, nInputSamples, pbAACBuffer, m_nMaxOutputBytes);
					if (nRet <= 0)
					{
						break;
					}
					fwrite(pbAACBuffer, 1, nRet, m_fpOut);
				}
				break;
			}
		}
		delete [] pbPCMBuffer;
		delete [] pbAACBuffer;
		delete [] sDataInfo.pData;
	}

	// Finally, flush the data still buffered inside the encoder
	BYTE *pbAACBuffer = new BYTE [m_nMaxOutputBytes];
	while((	nRet=faacEncEncode(m_hFaacEncHandle, NULL, 0, pbAACBuffer,m_nMaxOutputBytes)) > 0)
	{
		fwrite(pbAACBuffer,1,nRet,m_fpOut);
	}
	delete [] pbAACBuffer;
	faacEncClose(m_hFaacEncHandle);
	fclose(m_fpOut);

	m_bEndEncode = TRUE;
}

Now a word about audio/video synchronization during capture. FAAC needs a few calls' worth of samples as internal buffering before it starts producing output, so the first few audio chunks are effectively discarded, and which chunk succeeds first can vary from run to run. The video, on the other hand, encodes normally from its very first sample, so the two streams would not necessarily start in sync. I use a simple method here: start the FAAC encoding first and wait for the first chunk that actually produces output; its sample time becomes the reference. The video thread compares each frame's sample time against this reference and only starts encoding frames whose time is greater than or equal to it. This keeps the streams roughly in sync; better schemes certainly exist.

The last step is muxing: wrapping the encoded H.264 and AAC into an MP4 file. I will not go into much detail here; as long as you follow the MP4 format specification it is generally not a problem. Of course, the easiest way is to use FFmpeg. I referred to Lei Xiaohua's article 《最简单的基于FFmpeg的封装格式处理:视音频复用器(muxer)》 ("The simplest FFmpeg-based container handling: audio/video muxer"). Lei is a programmer I deeply admire; although he has left this world, his technical blog will keep shining. See the project download below for the details.
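
For reference, here is a minimal remuxing sketch in the spirit of that article. It is not the code from this project: it assumes a reasonably recent FFmpeg (3.x/4.x C API with AVCodecParameters), raw Annex-B .h264 and ADTS .aac input files whose demuxers behave as described in the comments, a known capture frame rate of 30 fps, and that the MP4 muxer converts the Annex-B/ADTS payloads as needed. It also simplifies by writing all video packets and then all audio packets instead of interleaving by timestamp as the referenced article does, and error cleanup is omitted for brevity; the file paths and function name are placeholders.

extern "C" {
#include <libavformat/avformat.h>
}

int MuxH264AacToMp4(const char *h264Path, const char *aacPath, const char *mp4Path)
{
	AVFormatContext *ifmtV = NULL, *ifmtA = NULL, *ofmt = NULL;

	// Open both elementary streams
	if (avformat_open_input(&ifmtV, h264Path, NULL, NULL) < 0) return -1;
	if (avformat_find_stream_info(ifmtV, NULL) < 0) return -1;
	if (avformat_open_input(&ifmtA, aacPath, NULL, NULL) < 0) return -1;
	if (avformat_find_stream_info(ifmtA, NULL) < 0) return -1;

	// Create the MP4 output and one stream per input, copying codec parameters (no re-encoding)
	if (avformat_alloc_output_context2(&ofmt, NULL, NULL, mp4Path) < 0) return -1;
	AVStream *outV = avformat_new_stream(ofmt, NULL);
	avcodec_parameters_copy(outV->codecpar, ifmtV->streams[0]->codecpar);
	outV->codecpar->codec_tag = 0;
	AVStream *outA = avformat_new_stream(ofmt, NULL);
	avcodec_parameters_copy(outA->codecpar, ifmtA->streams[0]->codecpar);
	outA->codecpar->codec_tag = 0;

	if (avio_open(&ofmt->pb, mp4Path, AVIO_FLAG_WRITE) < 0) return -1;
	if (avformat_write_header(ofmt, NULL) < 0) return -1;

	AVPacket pkt;
	const AVRational frameDur = { 1, 30 };   // assumed 30 fps; must match the capture rate
	int64_t videoFrame = 0;

	// Video packets: a raw .h264 stream carries no timestamps, so synthesize them from the frame rate
	while (av_read_frame(ifmtV, &pkt) >= 0) {
		pkt.stream_index = outV->index;
		pkt.pts = av_rescale_q(videoFrame, frameDur, outV->time_base);
		pkt.dts = pkt.pts;                   // no B-frames with the zerolatency preset
		pkt.duration = av_rescale_q(1, frameDur, outV->time_base);
		pkt.pos = -1;
		videoFrame++;
		av_interleaved_write_frame(ofmt, &pkt);
		av_packet_unref(&pkt);
	}

	// Audio packets: the ADTS demuxer provides timestamps, so just rescale them to the output time base
	while (av_read_frame(ifmtA, &pkt) >= 0) {
		AVStream *inA = ifmtA->streams[pkt.stream_index];
		pkt.stream_index = outA->index;
		pkt.pts = av_rescale_q(pkt.pts, inA->time_base, outA->time_base);
		pkt.dts = pkt.pts;
		pkt.duration = av_rescale_q(pkt.duration, inA->time_base, outA->time_base);
		pkt.pos = -1;
		av_interleaved_write_frame(ofmt, &pkt);
		av_packet_unref(&pkt);
	}

	av_write_trailer(ofmt);
	avio_closep(&ofmt->pb);
	avformat_close_input(&ifmtV);
	avformat_close_input(&ifmtA);
	avformat_free_context(ofmt);
	return 0;
}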

Finally, here is the application UI once more:

(Figure 1: screenshot of the demo application's UI)


The complete project can be downloaded here: project source download






