《Android FFmpeg 播放器开发梳理》第四章 音频重采样与变速变调处理

前面一章,我们讲解了音频输出的处理,这一章将会讲解音频重采样以及变速变调处理。
AudioResampler 是负责音频重采样处理的对象。重采样器的主要职责是:从音频解码器 AudioDecoder 中取得解码后的一帧音频帧,然后根据同步类型,判断是否需要对其进行重采样以及变速变调处理。其实现代码如下:

/**
 * Audio parameters describing one PCM stream configuration
 * (source/decoded side or target/device side).
 */
typedef struct AudioParams {
    int freq;                   // sample rate in Hz
    int channels;               // number of channels
    int64_t channel_layout;     // FFmpeg channel layout mask
    enum AVSampleFormat fmt;    // sample format
    int frame_size;             // bytes of one sample frame across all channels
    int bytes_per_sec;          // bytes per second at freq/channels/fmt
} AudioParams;

/**
 * Audio resampling state: buffers, sync-averaging accumulators and the
 * swresample context shared across callbacks.
 */
typedef struct AudioState {
    double audioClock;                      // audio clock: pts of the end of the last processed frame (seconds)
    double audio_diff_cum;                  // exponentially-weighted cumulative clock difference
    double audio_diff_avg_coef;             // averaging coefficient for audio_diff_cum
    double audio_diff_threshold;            // avg diff above which sample-count correction kicks in
    int audio_diff_avg_count;               // number of diffs accumulated so far
    int audio_hw_buf_size;                  // hardware audio buffer size in bytes
    uint8_t *outputBuffer;                  // PCM handed to the device (aliases resampleBuffer, soundTouchBuffer or frame data)
    uint8_t *resampleBuffer;                // buffer receiving swr_convert() output
    short *soundTouchBuffer;                // 16-bit buffer for SoundTouch input/output
    unsigned int bufferSize;                // valid bytes currently in outputBuffer
    unsigned int resampleSize;              // allocated size of resampleBuffer (tracked by av_fast_malloc)
    unsigned int soundTouchBufferSize;      // allocated size of soundTouchBuffer (tracked by av_fast_malloc)
    int bufferIndex;                        // read cursor into outputBuffer
    int writeBufferSize;                    // bytes left unread in outputBuffer after a callback
    SwrContext *swr_ctx;                    // audio resampling context
    int64_t audio_callback_time;            // timestamp of the last device callback (av_gettime_relative)
    AudioParams audioParamsSrc;             // source (decoded frame) audio parameters
    AudioParams audioParamsTarget;          // target (device) audio parameters
} AudioState;

/**
 * Audio resampler: pulls decoded frames from AudioDecoder, converts them to
 * the device output format with libswresample, optionally applies
 * speed/pitch processing through SoundTouch, and fills the device PCM
 * callback while keeping the audio clock in sync via MediaSync.
 */
class AudioResampler {
public:
    AudioResampler(PlayerState *playerState, AudioDecoder *audioDecoder, MediaSync *mediaSync);

    virtual ~AudioResampler();

    // Configures the target (device) parameters from the opened device spec.
    // Returns 0 on success, -1 when the derived buffer sizes are invalid.
    int setResampleParams(AudioDeviceSpec *spec, int64_t wanted_channel_layout);

    // Audio device callback: fills `stream` with `len` bytes of PCM data.
    void pcmQueueCallback(uint8_t *stream, int len);

private:
    // Returns the desired sample count for a frame of nbSamples, corrected
    // for A/V sync when audio is not the master clock.
    int audioSynchronize(int nbSamples);

    // Fetches and converts one audio frame; returns output size in bytes,
    // negative on failure/abort/pause.
    int audioFrameResample();

private:
    PlayerState *playerState;               // shared player state (not owned)
    MediaSync *mediaSync;                   // A/V synchronizer (not owned)

    AVFrame *frame;                         // reusable decoded-frame holder (owned)
    AudioDecoder *audioDecoder;             // audio decoder (not owned)
    AudioState *audioState;                 // resampling state (owned)
    SoundTouchWrapper *soundTouchWrapper;   // speed/pitch processor (owned)
};


/**
 * Creates the resampler.
 *
 * @param playerState  shared player state (borrowed, not owned)
 * @param audioDecoder audio frame source (borrowed, not owned)
 * @param mediaSync    A/V synchronizer to notify of clock updates (borrowed)
 */
AudioResampler::AudioResampler(PlayerState *playerState, AudioDecoder *audioDecoder, MediaSync *mediaSync) {
    this->playerState = playerState;
    this->audioDecoder = audioDecoder;
    this->mediaSync = mediaSync;
    // av_mallocz() already returns zero-filled memory, so the extra
    // memset() the original code performed here was redundant.
    audioState = (AudioState *) av_mallocz(sizeof(AudioState));
    soundTouchWrapper = new SoundTouchWrapper();
    frame = av_frame_alloc();
}

/**
 * Releases all owned resources: the SoundTouch wrapper, the resampling
 * state (including both scratch buffers and the SwrContext) and the
 * reusable AVFrame. Borrowed pointers are merely cleared.
 */
AudioResampler::~AudioResampler() {
    // Borrowed references: just drop them, they are owned elsewhere.
    playerState = NULL;
    audioDecoder = NULL;
    mediaSync = NULL;
    if (soundTouchWrapper) {
        delete soundTouchWrapper;
        soundTouchWrapper = NULL;
    }
    if (audioState) {
        swr_free(&audioState->swr_ctx);
        av_freep(&audioState->resampleBuffer);
        // Bug fix: soundTouchBuffer is grown with av_fast_malloc() in
        // audioFrameResample() but was never released here, leaking it.
        av_freep(&audioState->soundTouchBuffer);
        av_free(audioState);
        audioState = NULL;
    }
    if (frame) {
        // av_frame_free() unreferences the frame internally; a separate
        // av_frame_unref() beforehand is unnecessary.
        av_frame_free(&frame);
        frame = NULL;
    }
}

/**
 * Configures the resampler for the opened audio device.
 *
 * @param spec                  parameters of the opened audio device
 * @param wanted_channel_layout desired FFmpeg channel layout of the output
 * @return 0 on success, -1 if the derived frame size / byte rate is invalid
 */
int AudioResampler::setResampleParams(AudioDeviceSpec *spec, int64_t wanted_channel_layout) {

    // Bug fix: fill in the target (device) parameters FIRST. The original
    // code copied audioParamsSrc from audioParamsTarget and computed
    // audio_diff_threshold (which divides by bytes_per_sec) before
    // audioParamsTarget was initialized, dividing by a stale/zero value and
    // propagating uninitialized source parameters.
    audioState->audioParamsTarget.fmt = AV_SAMPLE_FMT_S16;
    audioState->audioParamsTarget.freq = spec->freq;
    audioState->audioParamsTarget.channel_layout = wanted_channel_layout;
    audioState->audioParamsTarget.channels = spec->channels;
    // Bytes of one sample frame (1 sample across all channels).
    audioState->audioParamsTarget.frame_size = av_samples_get_buffer_size(NULL, audioState->audioParamsTarget.channels, 1,
                                                                          audioState->audioParamsTarget.fmt, 1);
    // Bytes per second of output audio.
    audioState->audioParamsTarget.bytes_per_sec = av_samples_get_buffer_size(NULL, audioState->audioParamsTarget.channels,
                                                                             audioState->audioParamsTarget.freq,
                                                                             audioState->audioParamsTarget.fmt, 1);

    if (audioState->audioParamsTarget.bytes_per_sec <= 0 || audioState->audioParamsTarget.frame_size <= 0) {
        av_log(NULL, AV_LOG_ERROR, "av_samples_get_buffer_size failed\n");
        return -1;
    }

    // The source parameters start equal to the target; audioFrameResample()
    // overwrites them when the decoded format turns out to differ.
    audioState->audioParamsSrc = audioState->audioParamsTarget;
    audioState->audio_hw_buf_size = spec->size;
    audioState->bufferSize = 0;
    audioState->bufferIndex = 0;
    // Exponential averaging over AUDIO_DIFF_AVG_NB clock-diff measurements.
    audioState->audio_diff_avg_coef = exp(log(0.01) / AUDIO_DIFF_AVG_NB);
    audioState->audio_diff_avg_count = 0;
    // Correction threshold: the duration of one hardware buffer, now computed
    // from the freshly-set bytes_per_sec.
    audioState->audio_diff_threshold = (double) (audioState->audio_hw_buf_size) / audioState->audioParamsTarget.bytes_per_sec;

    return 0;
}

/**
 * Audio device callback: fills `stream` with exactly `len` bytes of PCM,
 * pulling and converting frames via audioFrameResample() as the internal
 * buffer drains, then pushes the corrected audio clock to MediaSync.
 *
 * @param stream device buffer to fill
 * @param len    number of bytes the device requests
 */
void AudioResampler::pcmQueueCallback(uint8_t *stream, int len) {
    int bufferSize, length;

    // No audio decoder available: output silence and return.
    if (!audioDecoder) {
        memset(stream, 0, len);
        return;
    }

    audioState->audio_callback_time = av_gettime_relative();
    while (len > 0) {
        // The current chunk is fully consumed: produce the next one.
        if (audioState->bufferIndex >= audioState->bufferSize) {
            bufferSize = audioFrameResample();
            if (bufferSize < 0) {
                // Conversion failed: emit silence, rounded down to a whole
                // number of sample frames.
                audioState->outputBuffer = NULL;
                audioState->bufferSize = (unsigned int) (AUDIO_MIN_BUFFER_SIZE / audioState->audioParamsTarget.frame_size
                                                         * audioState->audioParamsTarget.frame_size);
            } else {
                audioState->bufferSize = bufferSize;
            }
            audioState->bufferIndex = 0;
        }

        // Copy at most the unread remainder of the current chunk.
        length = audioState->bufferSize - audioState->bufferIndex;
        if (length > len) {
            length = len;
        }
        // Copy the converted PCM data into the device buffer; write silence
        // when muted or when no data is available.
        if (audioState->outputBuffer != NULL && !playerState->mute) {
            memcpy(stream, audioState->outputBuffer + audioState->bufferIndex, length);
        } else {
            memset(stream, 0, length);
        }
        len -= length;
        stream += length;
        audioState->bufferIndex += length;
    }
    // Bytes produced but not yet handed to the device.
    audioState->writeBufferSize = audioState->bufferSize - audioState->bufferIndex;

    // Update the audio clock, subtracting the latency of the data still
    // queued: two hardware buffers assumed in flight plus the unread
    // remainder, converted to seconds via bytes_per_sec.
    if (!isnan(audioState->audioClock) && mediaSync) {
        mediaSync->updateAudioClock(audioState->audioClock -
                                    (double) (2 * audioState->audio_hw_buf_size + audioState->writeBufferSize)
                                    / audioState->audioParamsTarget.bytes_per_sec,
                                    audioState->audio_callback_time / 1000000.0);
    }
}

/**
 * Computes the desired number of samples for a frame of nbSamples, applying
 * sample-count correction when the master clock is not the audio clock.
 * The correction uses an exponentially-weighted average of the audio clock
 * difference and is clamped to +/- SAMPLE_CORRECTION_PERCENT_MAX percent.
 *
 * @param nbSamples sample count of the decoded frame
 * @return the (possibly corrected) sample count to output
 */
int AudioResampler::audioSynchronize(int nbSamples) {
    // Audio is the master clock: no correction needed.
    if (playerState->syncType == AV_SYNC_AUDIO) {
        return nbSamples;
    }

    double clockDiff = mediaSync ? mediaSync->getAudioDiffClock() : 0;

    // An invalid or excessive difference means we cannot correct sensibly:
    // reset the averaging window and keep the frame untouched.
    if (isnan(clockDiff) || fabs(clockDiff) >= AV_NOSYNC_THRESHOLD) {
        audioState->audio_diff_avg_count = 0;
        audioState->audio_diff_cum = 0;
        return nbSamples;
    }

    // Fold this measurement into the exponentially-weighted accumulator.
    audioState->audio_diff_cum = clockDiff + audioState->audio_diff_avg_coef * audioState->audio_diff_cum;

    // Wait until enough measurements are collected to trust the average.
    if (audioState->audio_diff_avg_count < AUDIO_DIFF_AVG_NB) {
        audioState->audio_diff_avg_count++;
        return nbSamples;
    }

    double avgDiff = audioState->audio_diff_cum * (1.0 - audioState->audio_diff_avg_coef);
    if (fabs(avgDiff) < audioState->audio_diff_threshold) {
        return nbSamples;
    }

    // Drift is significant: add/remove samples, bounded to a safe percentage
    // of the original frame so the correction stays inaudible.
    int correctedSamples = nbSamples + (int) (clockDiff * audioState->audioParamsSrc.freq);
    int lowerBound = ((nbSamples * (100 - SAMPLE_CORRECTION_PERCENT_MAX) / 100));
    int upperBound = ((nbSamples * (100 + SAMPLE_CORRECTION_PERCENT_MAX) / 100));
    return av_clip(correctedSamples, lowerBound, upperBound);
}

/**
 * Fetches one decoded audio frame and converts it to the device format.
 *
 * Applies A/V-sync sample correction, (re)creates the SwrContext when the
 * frame's format/layout/rate differs from the recorded source parameters,
 * resamples into resampleBuffer, and — when playbackRate or playbackPitch
 * differ from 1.0 — runs the samples through SoundTouch. On success
 * audioState->outputBuffer points at the converted PCM and the audio clock
 * is updated from the frame's pts.
 *
 * @return byte size of the data in outputBuffer, negative on
 *         failure / abort / pause.
 */
int AudioResampler::audioFrameResample() {
    int data_size, resampled_data_size;
    int64_t dec_channel_layout;
    int wanted_nb_samples;
    int translate_time = 1;
    int ret = -1;

    // Bail out when there is no decoder, or playback is aborted/paused.
    if (!audioDecoder || playerState->abortRequest || playerState->pauseRequest) {
        return -1;
    }

    for (;;) {

        // Return immediately if fetching a decoded frame failed.
        if ((ret = audioDecoder->getAudioFrame(frame)) < 0) {
            return -1;
        }
        // ret == 0: no frame produced this round; try again.
        if (ret == 0) {
            continue;
        }

        data_size = av_samples_get_buffer_size(NULL, av_frame_get_channels(frame),
                                               frame->nb_samples,
                                               (AVSampleFormat)frame->format, 1);

        // Trust the frame's channel layout only if it matches its channel
        // count; otherwise derive a default layout from the count.
        dec_channel_layout =
                (frame->channel_layout && av_frame_get_channels(frame) == av_get_channel_layout_nb_channels(frame->channel_layout))
                ? frame->channel_layout : av_get_default_channel_layout(av_frame_get_channels(frame));
        wanted_nb_samples = audioSynchronize(frame->nb_samples);

        // (Re)build the resampling context when the frame's format deviates
        // from the recorded source parameters, or when sync correction needs
        // a resampler that does not exist yet.
        if (frame->format != audioState->audioParamsSrc.fmt
            || dec_channel_layout != audioState->audioParamsSrc.channel_layout
            || frame->sample_rate != audioState->audioParamsSrc.freq
            || (wanted_nb_samples != frame->nb_samples && !audioState->swr_ctx)) {

            swr_free(&audioState->swr_ctx);
            audioState->swr_ctx = swr_alloc_set_opts(NULL, audioState->audioParamsTarget.channel_layout,
                                                     audioState->audioParamsTarget.fmt, audioState->audioParamsTarget.freq,
                                                     dec_channel_layout, (AVSampleFormat)frame->format,
                                                     frame->sample_rate, 0, NULL);

            if (!audioState->swr_ctx || swr_init(audioState->swr_ctx) < 0) {
                av_log(NULL, AV_LOG_ERROR, "Cannot create sample rate converter for conversion of %d Hz %s %d channels to %d Hz %s %d channels!\n",
                       frame->sample_rate,
                       av_get_sample_fmt_name((AVSampleFormat)frame->format),
                       av_frame_get_channels(frame),
                       audioState->audioParamsTarget.freq,
                       av_get_sample_fmt_name(audioState->audioParamsTarget.fmt),
                       audioState->audioParamsTarget.channels);
                swr_free(&audioState->swr_ctx);
                return -1;
            }
            // Remember the new source format so the context is reused until
            // the decoded format changes again.
            audioState->audioParamsSrc.channel_layout = dec_channel_layout;
            audioState->audioParamsSrc.channels = av_frame_get_channels(frame);
            audioState->audioParamsSrc.freq = frame->sample_rate;
            audioState->audioParamsSrc.fmt = (AVSampleFormat)frame->format;
        }

        // Audio resampling.
        if (audioState->swr_ctx) {
            const uint8_t **in = (const uint8_t **)frame->extended_data;
            uint8_t **out = &audioState->resampleBuffer;
            // Output estimate with 256 samples of headroom for resampler delay.
            int out_count = (int64_t)wanted_nb_samples * audioState->audioParamsTarget.freq / frame->sample_rate + 256;
            int out_size  = av_samples_get_buffer_size(NULL, audioState->audioParamsTarget.channels, out_count, audioState->audioParamsTarget.fmt, 0);
            int len2;
            if (out_size < 0) {
                av_log(NULL, AV_LOG_ERROR, "av_samples_get_buffer_size() failed\n");
                return -1;
            }
            // Sync correction requested: let swresample stretch/shrink.
            if (wanted_nb_samples != frame->nb_samples) {
                if (swr_set_compensation(audioState->swr_ctx, (wanted_nb_samples - frame->nb_samples) * audioState->audioParamsTarget.freq / frame->sample_rate,
                                         wanted_nb_samples * audioState->audioParamsTarget.freq / frame->sample_rate) < 0) {
                    av_log(NULL, AV_LOG_ERROR, "swr_set_compensation() failed\n");
                    return -1;
                }
            }
            av_fast_malloc(&audioState->resampleBuffer, &audioState->resampleSize, out_size);
            if (!audioState->resampleBuffer) {
                return AVERROR(ENOMEM);
            }
            len2 = swr_convert(audioState->swr_ctx, out, out_count, in, frame->nb_samples);
            if (len2 < 0) {
                av_log(NULL, AV_LOG_ERROR, "swr_convert() failed\n");
                return -1;
            }
            // Output filled the whole buffer: the estimate may have been too
            // small; re-init the context so internal state is flushed.
            if (len2 == out_count) {
                av_log(NULL, AV_LOG_WARNING, "audio buffer is probably too small\n");
                if (swr_init(audioState->swr_ctx) < 0) {
                    swr_free(&audioState->swr_ctx);
                }
            }
            audioState->outputBuffer = audioState->resampleBuffer;
            resampled_data_size = len2 * audioState->audioParamsTarget.channels * av_get_bytes_per_sample(audioState->audioParamsTarget.fmt);

            // Speed/pitch processing via SoundTouch.
            if ((playerState->playbackRate != 1.0f || playerState->playbackPitch != 1.0f) && !playerState->abortRequest) {
                int bytes_per_sample = av_get_bytes_per_sample(audioState->audioParamsTarget.fmt);
                av_fast_malloc(&audioState->soundTouchBuffer, &audioState->soundTouchBufferSize, out_size * translate_time);
                // Repack the byte buffer into 16-bit little-endian samples
                // for SoundTouch (target fmt is S16, see setResampleParams).
                for (int i = 0; i < (resampled_data_size / 2); i++) {
                    audioState->soundTouchBuffer[i] = (audioState->resampleBuffer[i * 2] | (audioState->resampleBuffer[i * 2 + 1] << 8));
                }
                if (!soundTouchWrapper) {
                    soundTouchWrapper = new SoundTouchWrapper();
                }
                int ret_len = soundTouchWrapper->translate(audioState->soundTouchBuffer, (float)(playerState->playbackRate),
                                                           (float)(playerState->playbackPitch != 1.0f ? playerState->playbackPitch : 1.0f / playerState->playbackRate),
                                                           resampled_data_size / 2, bytes_per_sample,
                                                           audioState->audioParamsTarget.channels, frame->sample_rate);
                if (ret_len > 0) {
                    audioState->outputBuffer = (uint8_t*)audioState->soundTouchBuffer;
                    resampled_data_size = ret_len;
                } else {
                    // SoundTouch produced nothing yet: grow the scratch buffer
                    // next round and feed it another frame.
                    translate_time++;
                    av_frame_unref(frame);
                    continue;
                }
            }
        } else {
            // Formats already match: pass the decoded data through untouched.
            audioState->outputBuffer = frame->data[0];
            resampled_data_size = data_size;
        }

        // One frame fully processed: leave the loop.
        break;
    }

    // Update the audio clock from the pts: end-of-frame time in seconds.
    if (frame->pts != AV_NOPTS_VALUE) {
        audioState->audioClock = frame->pts * av_q2d((AVRational){1, frame->sample_rate})
                                 + (double) frame->nb_samples / frame->sample_rate;
    } else {
        audioState->audioClock = NAN;
    }

    // Drop the frame's buffer references to avoid leaking them.
    av_frame_unref(frame);

    return resampled_data_size;
}

以上就是音频重采样以及变速变调处理的代码。在不是同步到音频时钟的情况下,我们需要根据实际的采样率(sample_rate)得到目标采样率对应的采样数量(wanted_nb_samples);然后经过音频重采样处理,得到重采样后的缓冲数据,再做变速变调处理;接着计算出重采样以及变速变调处理后的时长,加上原来的时钟,得到处理后的音频时间戳(pts)。我们通过不断地把音频输出设备回调的 PCM 缓冲区填满;填满后,我们需要计算出当前的音频时间戳用了多少,通知 MediaSync 更新音频时钟以及同步更新外部时钟。音频重采样以及变速变调处理的流程大体就是这样。

当音频输出设备回调填充PCM数据方法时,我们的播放器将会通过void pcmQueueCallback(uint8_t *stream, int len); 方法调用音频重采样器进行处理,代码如下:

/**
 * Forwards the audio device's PCM request to the resampler; outputs silence
 * when no resampler exists yet.
 *
 * @param stream device buffer to fill
 * @param len    number of bytes requested by the device
 */
void MediaPlayer::pcmQueueCallback(uint8_t *stream, int len) {
    if (!audioResampler) {
        // Bug fix: the original used memset(stream, 0, sizeof(len)), which
        // zeroes only sizeof(int) == 4 bytes and leaves the rest of the
        // device buffer as garbage noise. The whole `len` bytes must be
        // silenced.
        memset(stream, 0, len);
        return;
    }
    audioResampler->pcmQueueCallback(stream, len);
}

至此,音频重采样以及变速变调处理就讲解完了。
完整代码请参考本人的播放器项目:CainPlayer

你可能感兴趣的:(《Android FFmpeg 播放器开发梳理》第四章 音频重采样与变速变调处理)