FFmpeg例子:resampling_audio分析

FFmpeg版本:3.4.2

FFmpeg的官方例子中的resampling_audio,从名字上看它是一个对音频重采样的例子。但实际上除了重采样,它还能转换采样格式和声道数。

重采样:举个例子,将一个采样率为44100的音频转换成采样率为8000的音频,这个过程就叫做音频的重采样。由于源音频和目标音频的采样率之比不一定是整数,为了重采样后的目标音频尽可能地不失真,其中涉及到的算法是很复杂的。

声道:常见的音频有立体声(stereo)和单声道(mono)两种类型,另外还有环绕立体声等其它不太常用的类型。立体声包含左声道和右声道。

采样格式:数字音频本质上是由很多个“采样”组成的。以不同声道的采样排列方式来区分,采样格式可以分为平坦(planar)和非平坦(packed,也叫交错 interleaved)两种类型;除此之外,还会以采样的数值精度和数值类型来区分采样格式。

平坦形式是指各个声道的采样都由各自的数组储存,有多少个声道就有多少个数组;非平坦形式则只有一个数组,所有声道的采样都交错储存在同一个数组中。由于单声道只有一个声道,所以平坦和非平坦两种储存方式的结果是一样的。以立体声为例:

FFmpeg例子:resampling_audio分析_第1张图片
平坦和非平坦采样排列方式示意图

另外,根据采样的取值范围和类型来区分不同的格式:
AV_SAMPLE_FMT_U8:无符号8位整型
AV_SAMPLE_FMT_S16: 带符号16位整型
AV_SAMPLE_FMT_S32: 带符号32位整型
AV_SAMPLE_FMT_S64: 带符号64位整型
AV_SAMPLE_FMT_FLT: float
AV_SAMPLE_FMT_DBL: double

知道上面这些概念,代码就很好理解了。

  1. 初始化
int64_t src_ch_layout = AV_CH_LAYOUT_STEREO;        // 输入立体声
int64_t dst_ch_layout = AV_CH_LAYOUT_SURROUND;   // 输出环绕立体声

int src_rate = 48000;
Int dst_rate = 11025;    // 例子中的输出采样率为44100,但为了更直观地看出转换后的变化,所以选择一个小一点的输出采样率。

/* 创建Context */
struct SwrContext *swr_ctx = swr_alloc();
/* 设置参数 */
av_opt_set_int(swr_ctx, "in_channel_layout",    src_ch_layout, 0);
av_opt_set_int(swr_ctx, "in_sample_rate",       src_rate, 0);
av_opt_set_sample_fmt(swr_ctx, "in_sample_fmt", src_sample_fmt, 0);

av_opt_set_int(swr_ctx, "out_channel_layout",    dst_ch_layout, 0);
av_opt_set_int(swr_ctx, "out_sample_rate",       dst_rate, 0);
av_opt_set_sample_fmt(swr_ctx, "out_sample_fmt", dst_sample_fmt, 0);
  1. 分配缓冲数组空间
/* 分配输入缓冲空间 */
uint8_t **src_data = NULL;
src_nb_channels = av_get_channel_layout_nb_channels(src_ch_layout);
ret = av_samples_alloc_array_and_samples(&src_data, &src_linesize, src_nb_channels,
                                             src_nb_samples, src_sample_fmt, 0);

/* 分配输出缓冲空间 */
Uint8_t **dst_data = NULL;
dst_nb_channels = av_get_channel_layout_nb_channels(dst_ch_layout);
ret = av_samples_alloc_array_and_samples(&dst_data, &dst_linesize, dst_nb_channels,
                                             dst_nb_samples, dst_sample_fmt, 0);
  1. 填充输入数据
/**
 * 将一段正弦波作为测试音频填充到输入数组中
 */
static void fill_samples(double *dst, int nb_samples, int nb_channels, int sample_rate, double *t)
{
    int i, j;
    double tincr = 1.0 / sample_rate, *dstp = dst;
    const double c = 2 * M_PI * 440.0;

    /* generate sin tone with 440Hz frequency and duplicated channels */
    for (i = 0; i < nb_samples; i++) {
        *dstp = sin(c * *t);
        for (j = 1; j < nb_channels; j++)
            dstp[j] = dstp[0];
        dstp += nb_channels;
        *t += tincr;
    }
}
/* Generate synthetic audio: write src_nb_samples frames of the test tone into
 * src_data[0], viewed as an array of double samples. t is passed by address
 * because fill_samples advances it for every frame it writes. */
fill_samples((double *)src_data[0], src_nb_samples, src_nb_channels, src_rate, &t);

例子中的输入采样格式是AV_SAMPLE_FMT_DBL,采样数值类型为double,为非平坦格式,所有数据都存在src_data[0]这个数组中。

  1. 转换并输出数据
/* Convert the samples in src_data and store the result in dst_data */
ret = swr_convert(swr_ctx, dst_data, dst_nb_samples, (const uint8_t **)src_data, src_nb_samples);
if (ret < 0) {
    fprintf(stderr, "Error while converting\n");
    goto end;
}
// Compute the byte size of the converted data; ret is passed as the sample count
dst_bufsize = av_samples_get_buffer_size(&dst_linesize, dst_nb_channels,
                                                 ret, dst_sample_fmt, 1);
if (dst_bufsize < 0) {
    fprintf(stderr, "Could not get sample buffer size\n");
    goto end;
}
printf("t:%f in:%d out:%d\n", t, src_nb_samples, ret);
// Write dst_bufsize bytes from dst_data[0] to the output file
fwrite(dst_data[0], 1, dst_bufsize, dst_file);

由于例子中目标采样格式是AV_SAMPLE_FMT_S16,为非平坦格式,所有输出数据都存在dst_data[0]这个数组中。

  1. 结束
/* Free the sample buffers: first the sample data referenced by element 0,
 * then the pointer array itself */
if (src_data)
    av_freep(&src_data[0]);
av_freep(&src_data);
if (dst_data)
    av_freep(&dst_data[0]);
av_freep(&dst_data);
/* Free the resampler context */
swr_free(&swr_ctx);
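  1. 将输入和输出的音频用Audacity打开,可以直观地看到转换前后的变化。这里有一个问题:为什么输出音频的第三声道的波形是一条直线?(原因:AV_CH_LAYOUT_SURROUND 的第三个声道是中置声道(FC),而 libswresample 默认的混音矩阵在由立体声上混时,只把左、右声道原样映射到 FL、FR,不会为中置声道合成信号,所以该声道是静音。)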
    FFmpeg例子:resampling_audio分析_第2张图片
    audacity.png
  2. 总结:这个例子只展示了非平坦采样格式之间的音频转换,没有涉及平坦与非平坦采样格式之间的转换。刚开始我并不熟悉这两者的区别,导致音频转换失败。下面我对这个例子进行重构,让它能适用于更多种音频格式的转换。

convertor.h

//
//  convertor.h
//  SoundEditor
//
//  Created by Kidon Liang on 2018/4/1.
//  Copyright © 2018年 Kidon Liang. All rights reserved.
//

#ifndef convertor_h
#define convertor_h

#include 
#include 
#include 
#include 

/**
 * 初始化
 *
 * @param src_ch_layout    输入声道类型
 * @param src_sample_fmt   输入采样格式
 * @param src_sample_rate  输入采样率
 * @param dst_ch_layout    输出声道类型
 * @param dst_sample_fmt   输出采样格式
 * @param dst_sample_rate  输出采样率
 **/
int convertor_init(int64_t src_ch_layout, enum AVSampleFormat src_sample_fmt, int src_sample_rate,
                   int64_t dst_ch_layout, enum AVSampleFormat dst_sample_fmt, int dst_sample_rate);

/**
 * 输入数据
 *
 * @param data 音频数据
 * @param len  数据长度
 **/
int convertor_feed_data(uint8_t **data, int len);

/**
 * 获取已转换数据长度
 **/
int convertor_get_converted_size(void);

/**
 * 接收已转换数据
 *
 * @param data 接收数据的数组
 **/
int convertor_receive_converted_data(uint8_t **data);

/**
 * 排空所有数据
 **/
void convertor_flush(void);

/**
 * 关闭转换器
 **/
void convertor_close(void);

#endif /* convertor_h */

convertor.c

//
//  convertor.c
//  SoundEditor
//
//  Created by Kidon Liang on 2018/4/1.
//  Copyright © 2018年 Kidon Liang. All rights reserved.
//

#include "convertor.h"

#include <string.h>

/* Scratch variable holding the most recent return value of an FFmpeg call */
static int tmp_ret = 0;
/* Resampler context created by convertor_init */
static struct SwrContext *swr_ctx = NULL;
/* Source / destination sample buffers allocated by convertor_init */
static uint8_t **src_buffers = NULL;
static uint8_t **dst_buffers = NULL;

/* Sample formats passed to convertor_init */
static enum AVSampleFormat src_sample_fmt;
static enum AVSampleFormat dst_sample_fmt;

/* Sample rates passed to convertor_init */
static int src_sample_rate;
static int dst_sample_rate;

static int src_nb_samples = 1024;  // per the author, FFmpeg's default number of samples per call is 1024
static int dst_nb_samples;
static int max_dst_nb_samples;     // largest output sample count seen so far, used to avoid overrunning the output buffer

/* Line sizes reported by the av_samples_alloc* calls */
static int src_linesize;
static int dst_linesize;

/* Source channel count, and buffer count (channel count if planar, otherwise 1) */
static int src_nb_channels;
static int src_nb_buffers;

/* Destination channel count, and buffer count (channel count if planar, otherwise 1) */
static int dst_nb_channels;
static int dst_nb_buffers;

/**
 * Configure the module-level resampler and allocate its sample buffers.
 *
 * Stores the formats and rates in file-scope state, creates and initializes the
 * SwrContext, then allocates a source buffer for src_nb_samples samples and a
 * destination buffer sized by rescaling that count to the output rate (rounded up).
 * On every failure path convertor_close() is called before returning.
 *
 * @param src_ch_layout  input channel layout
 * @param src_sp_fmt     input sample format
 * @param src_sp_rate    input sample rate
 * @param dst_ch_layout  output channel layout
 * @param dst_sp_fmt     output sample format
 * @param dst_sp_rate    output sample rate
 * @return 0 on success; -1 if swr_alloc failed, -2 if swr_init failed,
 *         -3 / -4 if the source / destination buffers could not be allocated
 */
int convertor_init(int64_t src_ch_layout, enum AVSampleFormat src_sp_fmt, int src_sp_rate,
                   int64_t dst_ch_layout, enum AVSampleFormat dst_sp_fmt, int dst_sp_rate) {
    int status = 0;

    /* Keep the caller's parameters in the file-scope state. */
    src_sample_fmt  = src_sp_fmt;
    src_sample_rate = src_sp_rate;
    dst_sample_fmt  = dst_sp_fmt;
    dst_sample_rate = dst_sp_rate;

    /* Planar formats use one buffer per channel, packed formats a single buffer. */
    src_nb_channels = av_get_channel_layout_nb_channels(src_ch_layout);
    if (av_sample_fmt_is_planar(src_sample_fmt)) {
        src_nb_buffers = src_nb_channels;
    } else {
        src_nb_buffers = 1;
    }

    dst_nb_channels = av_get_channel_layout_nb_channels(dst_ch_layout);
    if (av_sample_fmt_is_planar(dst_sample_fmt)) {
        dst_nb_buffers = dst_nb_channels;
    } else {
        dst_nb_buffers = 1;
    }

    swr_ctx = swr_alloc();
    if (swr_ctx == NULL) {
        printf("can not alloc SwrContext.");
        status = -1;
        goto fail;
    }

    /* Input side options */
    av_opt_set_int(swr_ctx, "in_channel_layout", src_ch_layout, 0);
    av_opt_set_int(swr_ctx, "in_sample_rate", src_sample_rate, 0);
    av_opt_set_sample_fmt(swr_ctx, "in_sample_fmt", src_sample_fmt, 0);

    /* Output side options */
    av_opt_set_int(swr_ctx, "out_channel_layout", dst_ch_layout, 0);
    av_opt_set_int(swr_ctx, "out_sample_rate", dst_sample_rate, 0);
    av_opt_set_sample_fmt(swr_ctx, "out_sample_fmt", dst_sample_fmt, 0);

    if (swr_init(swr_ctx) < 0) {
        printf("Failed to initialize the resampling context\n");
        status = -2;
        goto fail;
    }

    /* Source buffer: the line size is printed before the result is checked. */
    tmp_ret = av_samples_alloc_array_and_samples(&src_buffers, &src_linesize, src_nb_channels,
                                                 src_nb_samples, src_sample_fmt, 0);
    printf("src_linesize=%d\n", src_linesize);
    if (tmp_ret < 0) {
        printf("Could not allocate source samples\n");
        status = -3;
        goto fail;
    }

    /* Destination buffer: sized for src_nb_samples rescaled to the output rate. */
    dst_nb_samples = (int)av_rescale_rnd(src_nb_samples, dst_sample_rate,
                                         src_sample_rate, AV_ROUND_UP);
    max_dst_nb_samples = dst_nb_samples;

    tmp_ret = av_samples_alloc_array_and_samples(&dst_buffers, &dst_linesize, dst_nb_channels,
                                                 dst_nb_samples, dst_sample_fmt, 0);
    printf("dst_linesize=%d\n", dst_linesize);
    if (tmp_ret < 0) {
        printf("Could not allocate destination samples\n");
        status = -4;
        goto fail;
    }

    return 0;

fail:
    convertor_close();
    return status;
}

/* Byte size of the most recent conversion result; set by convertor_feed_data */
static int converted_size = 0;

/**
 * 输入数据,需要确保每次输入1024个样本数据。
 **/
int convertor_feed_data(uint8_t **data, int len) {
    // 填充数据
    for (int i=0;i max_dst_nb_samples) {
        // 重新分配输出buffer
        if (dst_buffers) {
            av_freep(&dst_buffers[0]);
        }
        tmp_ret = av_samples_alloc(dst_buffers, &dst_linesize, dst_nb_channels,
                               dst_nb_samples, dst_sample_fmt, 1);
        printf("realloc dst_linesize=%d\n", dst_linesize);
        if (tmp_ret < 0) {
            printf("重新分配输出buffer失败\n");
            convertor_close();
            return -1;
        }
        max_dst_nb_samples = dst_nb_samples;
    }
    
    /* convert to destination format */
    tmp_ret = swr_convert(swr_ctx, dst_buffers, dst_nb_samples, (const uint8_t **)src_buffers, src_nb_samples);
    if (tmp_ret < 0) {
        printf("Error while converting\n");
        convertor_close();
        return -2;
    }
    converted_size = av_samples_get_buffer_size(&dst_linesize, dst_nb_channels,
                                             tmp_ret, dst_sample_fmt, 1);
    if (converted_size < 0) {
        printf("Could not get sample buffer size\n");
        convertor_close();
        return -3;
    }
    return 0;
}

/**
 * Return converted_size, which convertor_feed_data sets to the byte size of the
 * data produced by its most recent conversion.
 */
int convertor_get_converted_size(void) {
    return converted_size;
}

int convertor_receive_converted_data(uint8_t **data) {
    tmp_ret = converted_size;
    for (int i=0;i

main.c

//
//  main.c
//  SoundEditor
//
//  Created by Kidon Liang on 2018/3/31.
//  Copyright © 2018年 Kidon Liang. All rights reserved.
//

#include 
#include "convertor.h"

/**
 * Fill float sample buffers with a 440 Hz sine tone, identical on every channel.
 *
 * @param data         sample buffers: one per channel for planar formats,
 *                     only data[0] (interleaved) otherwise
 * @param sample_fmt   sample format, used only to choose planar or packed layout
 * @param nb_samples   number of samples to generate per channel
 * @param nb_channels  number of channels
 * @param sample_rate  sample rate in Hz; the time step per sample is 1 / sample_rate
 * @param time         running time in seconds, read and advanced once per sample
 */
static void fill_samples_float(float **data, enum AVSampleFormat sample_fmt,
                               int nb_samples, int nb_channels, int sample_rate, double *time) {
    const double omega = 2 * M_PI * 440.0;
    const double step = 1.0 / sample_rate;

    if (!av_sample_fmt_is_planar(sample_fmt)) {
        /* Packed layout: channels of one sample sit next to each other in data[0]. */
        float *cursor = data[0];
        for (int n = 0; n < nb_samples; n++) {
            const double tone = sin(omega * *time);
            for (int ch = 0; ch < nb_channels; ch++) {
                *cursor++ = (float)tone;
            }
            *time += step;
        }
        return;
    }

    /* Planar layout: sample n of channel ch lives at data[ch][n]. */
    for (int n = 0; n < nb_samples; n++) {
        const double tone = sin(omega * *time);
        for (int ch = 0; ch < nb_channels; ch++) {
            data[ch][n] = (float)tone;
        }
        *time += step;
    }
}

/**
 * Fill double sample buffers with a 440 Hz sine tone, identical on every channel.
 *
 * @param data         sample buffers: one per channel for planar formats,
 *                     only data[0] (interleaved) otherwise
 * @param sample_fmt   sample format, used only to choose planar or packed layout
 * @param nb_samples   number of samples to generate per channel
 * @param nb_channels  number of channels
 * @param sample_rate  sample rate in Hz; the time step per sample is 1 / sample_rate
 * @param time         running time in seconds, read and advanced once per sample
 */
static void fill_samples_double(double **data, enum AVSampleFormat sample_fmt,
                                int nb_samples, int nb_channels, int sample_rate, double *time) {
    const double omega = 2 * M_PI * 440.0;
    const double step = 1.0 / sample_rate;

    if (!av_sample_fmt_is_planar(sample_fmt)) {
        /* Packed layout: channels of one sample sit next to each other in data[0]. */
        double *cursor = data[0];
        for (int n = 0; n < nb_samples; n++) {
            const double tone = sin(omega * *time);
            for (int ch = 0; ch < nb_channels; ch++) {
                *cursor++ = tone;
            }
            *time += step;
        }
        return;
    }

    /* Planar layout: sample n of channel ch lives at data[ch][n]. */
    for (int n = 0; n < nb_samples; n++) {
        const double tone = sin(omega * *time);
        for (int ch = 0; ch < nb_channels; ch++) {
            data[ch][n] = tone;
        }
        *time += step;
    }
}

/**
 * Fill signed 16-bit sample buffers with a 440 Hz sine tone scaled to an
 * amplitude of 32767, identical on every channel.
 *
 * @param data         sample buffers: one per channel for planar formats,
 *                     only data[0] (interleaved) otherwise
 * @param sample_fmt   sample format, used only to choose planar or packed layout
 * @param nb_samples   number of samples to generate per channel
 * @param nb_channels  number of channels
 * @param sample_rate  sample rate in Hz; the time step per sample is 1 / sample_rate
 * @param time         running time in seconds, read and advanced once per sample
 */
static void fill_samples_16(int16_t **data, enum AVSampleFormat sample_fmt,
                            int nb_samples, int nb_channels, int sample_rate, double *time) {
    const double omega = 2 * M_PI * 440.0;
    const double step = 1.0 / sample_rate;
    const int amplitude = 32768 - 1;

    if (!av_sample_fmt_is_planar(sample_fmt)) {
        /* Packed layout: channels of one sample sit next to each other in data[0]. */
        int16_t *cursor = data[0];
        for (int n = 0; n < nb_samples; n++) {
            const double tone = amplitude * sin(omega * *time);
            for (int ch = 0; ch < nb_channels; ch++) {
                *cursor++ = (int16_t)tone;
            }
            *time += step;
        }
        return;
    }

    /* Planar layout: sample n of channel ch lives at data[ch][n]. */
    for (int n = 0; n < nb_samples; n++) {
        const double tone = amplitude * sin(omega * *time);
        for (int ch = 0; ch < nb_channels; ch++) {
            data[ch][n] = (int16_t)tone;
        }
        *time += step;
    }
}

/**
 * Fill the sample buffers with the test tone, dispatching on the sample format.
 *
 * S16/S16P, FLT/FLTP and DBL/DBLP are handled by the matching typed helper;
 * any other format only prints a message and leaves the buffers untouched.
 *
 * @param data         sample buffers, reinterpreted as the element type of sample_fmt
 * @param sample_fmt   sample format of the buffers
 * @param nb_samples   number of samples to generate per channel
 * @param nb_channels  number of channels
 * @param sample_rate  sample rate in Hz
 * @param time         running time in seconds, advanced by the helper
 */
static void fill_samples(uint8_t **data, enum AVSampleFormat sample_fmt,
                         int nb_samples, int nb_channels, int sample_rate, double *time) {
    if (sample_fmt == AV_SAMPLE_FMT_S16 || sample_fmt == AV_SAMPLE_FMT_S16P) {
        fill_samples_16((int16_t **) data, sample_fmt, nb_samples, nb_channels, sample_rate, time);
    } else if (sample_fmt == AV_SAMPLE_FMT_FLT || sample_fmt == AV_SAMPLE_FMT_FLTP) {
        fill_samples_float((float **) data, sample_fmt, nb_samples, nb_channels, sample_rate, time);
    } else if (sample_fmt == AV_SAMPLE_FMT_DBL || sample_fmt == AV_SAMPLE_FMT_DBLP) {
        fill_samples_double((double **) data, sample_fmt, nb_samples, nb_channels, sample_rate, time);
    } else {
        printf("其它格式的采样填充方法就不写了。");
    }
}

int main() {
    
    char *src_path = "/Users/kidonliang/Desktop/src-0.pcm";
    FILE *src_file = fopen(src_path, "wb");

    char *dst_path = "/Users/kidonliang/Desktop/dst-0.pcm";
    FILE *dst_file = fopen(dst_path, "wb");

    enum AVSampleFormat src_sample_fmt = AV_SAMPLE_FMT_S16;
    enum AVSampleFormat dst_sample_fmt = AV_SAMPLE_FMT_FLTP;

    int64_t src_ch_layout = AV_CH_LAYOUT_MONO;
    int64_t dst_ch_layout = AV_CH_LAYOUT_MONO;

    int src_sample_rate = 44100;
    int dst_sample_rate = 16000;

    int src_nb_channels = av_get_channel_layout_nb_channels(src_ch_layout);
    int dst_nb_channels = av_get_channel_layout_nb_channels(dst_ch_layout);

    convertor_init(src_ch_layout, src_sample_fmt, src_sample_rate,
                   dst_ch_layout, dst_sample_fmt, dst_sample_rate);

    double time = 0;
    uint8_t **src_buffers = NULL;
    int src_bytes_ps = av_get_bytes_per_sample(src_sample_fmt);
    // alloc source buffers
    if (av_sample_fmt_is_planar(src_sample_fmt)) {
        src_buffers = malloc(src_nb_channels * sizeof(void *));
        for (int i=0;i

你可能感兴趣的:(FFmpeg例子:resampling_audio分析)