音频就是一段连续的波。
在AU里查看一首音乐,就是这样的界面。
把波形放大查看,
这绿绿的曲线,是不是很像股票,哈哈哈。
还是使用MediaInfo查看音频的属性,然后对照着说。
在真实的物理世界中,声音是连续的。而在电子世界中,数据都是离散的,不管是声音,还是拍照、录像。当采样的频率足够高,高到人的眼睛和耳朵都分辨不出来,那么人就认为是连续的。
iPhone的Retina屏幕概念、显示器的抖动算法、视频的原理……都是这个原理。
采样率最直白的解释就是:一秒钟采样多少个点。
最常见的音频采样率有三种:8kHz系列,44.1kHz系列和48kHz系列。
语音通话的采样率比音乐要低很多,这也就是另一篇博文里我说音乐下采样到通话中时,音质下降很厉害的原因。
在AU里直观查看,
图中一个个的点就是音频的采样点,44.1kHz意思就是:每一秒钟的音频,有44100个点。
采样率描述的是:采样点在横坐标——时间维度上的密集程度。
位深描述的是:采样点在纵坐标——声压维度上的精度。打个比方:假如最小音量是1,最大音量是100,位深描述的就是精确到小数点后几位。如果不要小数点,音量就只能有100级;如果精确到小数点后1位,音量就能有1000级,描述音量就会更加精准、细腻。例如16bit位深,就可以表示2^16 = 65536级。
声道数就是有几条音轨,反映在AU里,就是有几个绿条。
音乐一般都是双声道,电影有2.1声道,或者Dolby环绕立体声,包括:5.1声道、7.1声道、14.1声道等,其中.1表示低音声道。
安卓系统在system/media/audio/include/system/audio-base.h里定义了很多的声道:
185enum {
186 AUDIO_CHANNEL_REPRESENTATION_POSITION = 0x0u,
187 AUDIO_CHANNEL_REPRESENTATION_INDEX = 0x2u,
188 AUDIO_CHANNEL_NONE = 0x0u,
189 AUDIO_CHANNEL_INVALID = 0xC0000000u,
190
191 AUDIO_CHANNEL_OUT_FRONT_LEFT = 0x1u,
192 AUDIO_CHANNEL_OUT_FRONT_RIGHT = 0x2u,
193 AUDIO_CHANNEL_OUT_FRONT_CENTER = 0x4u,
194 AUDIO_CHANNEL_OUT_LOW_FREQUENCY = 0x8u,
195 AUDIO_CHANNEL_OUT_BACK_LEFT = 0x10u,
196 AUDIO_CHANNEL_OUT_BACK_RIGHT = 0x20u,
197 AUDIO_CHANNEL_OUT_FRONT_LEFT_OF_CENTER = 0x40u,
198 AUDIO_CHANNEL_OUT_FRONT_RIGHT_OF_CENTER = 0x80u,
199 AUDIO_CHANNEL_OUT_BACK_CENTER = 0x100u,
200 AUDIO_CHANNEL_OUT_SIDE_LEFT = 0x200u,
201 AUDIO_CHANNEL_OUT_SIDE_RIGHT = 0x400u,
202 AUDIO_CHANNEL_OUT_TOP_CENTER = 0x800u,
203 AUDIO_CHANNEL_OUT_TOP_FRONT_LEFT = 0x1000u,
204 AUDIO_CHANNEL_OUT_TOP_FRONT_CENTER = 0x2000u,
205 AUDIO_CHANNEL_OUT_TOP_FRONT_RIGHT = 0x4000u,
206 AUDIO_CHANNEL_OUT_TOP_BACK_LEFT = 0x8000u,
207 AUDIO_CHANNEL_OUT_TOP_BACK_CENTER = 0x10000u,
208 AUDIO_CHANNEL_OUT_TOP_BACK_RIGHT = 0x20000u,
209 AUDIO_CHANNEL_OUT_TOP_SIDE_LEFT = 0x40000u,
210 AUDIO_CHANNEL_OUT_TOP_SIDE_RIGHT = 0x80000u,
211 AUDIO_CHANNEL_OUT_MONO = 0x1u, // OUT_FRONT_LEFT
212 AUDIO_CHANNEL_OUT_STEREO = 0x3u, // OUT_FRONT_LEFT | OUT_FRONT_RIGHT
213 AUDIO_CHANNEL_OUT_2POINT1 = 0xBu, // OUT_FRONT_LEFT | OUT_FRONT_RIGHT | OUT_LOW_FREQUENCY
214 AUDIO_CHANNEL_OUT_2POINT0POINT2 = 0xC0003u, // OUT_FRONT_LEFT | OUT_FRONT_RIGHT | OUT_TOP_SIDE_LEFT | OUT_TOP_SIDE_RIGHT
215 AUDIO_CHANNEL_OUT_2POINT1POINT2 = 0xC000Bu, // OUT_FRONT_LEFT | OUT_FRONT_RIGHT | OUT_TOP_SIDE_LEFT | OUT_TOP_SIDE_RIGHT | OUT_LOW_FREQUENCY
216 AUDIO_CHANNEL_OUT_3POINT0POINT2 = 0xC0007u, // OUT_FRONT_LEFT | OUT_FRONT_CENTER | OUT_FRONT_RIGHT | OUT_TOP_SIDE_LEFT | OUT_TOP_SIDE_RIGHT
217 AUDIO_CHANNEL_OUT_3POINT1POINT2 = 0xC000Fu, // OUT_FRONT_LEFT | OUT_FRONT_CENTER | OUT_FRONT_RIGHT | OUT_TOP_SIDE_LEFT | OUT_TOP_SIDE_RIGHT | OUT_LOW_FREQUENCY
218 AUDIO_CHANNEL_OUT_QUAD = 0x33u, // OUT_FRONT_LEFT | OUT_FRONT_RIGHT | OUT_BACK_LEFT | OUT_BACK_RIGHT
219 AUDIO_CHANNEL_OUT_QUAD_BACK = 0x33u, // OUT_QUAD
220 AUDIO_CHANNEL_OUT_QUAD_SIDE = 0x603u, // OUT_FRONT_LEFT | OUT_FRONT_RIGHT | OUT_SIDE_LEFT | OUT_SIDE_RIGHT
221 AUDIO_CHANNEL_OUT_SURROUND = 0x107u, // OUT_FRONT_LEFT | OUT_FRONT_RIGHT | OUT_FRONT_CENTER | OUT_BACK_CENTER
222 AUDIO_CHANNEL_OUT_PENTA = 0x37u, // OUT_QUAD | OUT_FRONT_CENTER
223 AUDIO_CHANNEL_OUT_5POINT1 = 0x3Fu, // OUT_FRONT_LEFT | OUT_FRONT_RIGHT | OUT_FRONT_CENTER | OUT_LOW_FREQUENCY | OUT_BACK_LEFT | OUT_BACK_RIGHT
224 AUDIO_CHANNEL_OUT_5POINT1_BACK = 0x3Fu, // OUT_5POINT1
225 AUDIO_CHANNEL_OUT_5POINT1_SIDE = 0x60Fu, // OUT_FRONT_LEFT | OUT_FRONT_RIGHT | OUT_FRONT_CENTER | OUT_LOW_FREQUENCY | OUT_SIDE_LEFT | OUT_SIDE_RIGHT
226 AUDIO_CHANNEL_OUT_5POINT1POINT2 = 0xC003Fu, // OUT_5POINT1 | OUT_TOP_SIDE_LEFT | OUT_TOP_SIDE_RIGHT
227 AUDIO_CHANNEL_OUT_5POINT1POINT4 = 0x2D03Fu, // OUT_5POINT1 | OUT_TOP_FRONT_LEFT | OUT_TOP_FRONT_RIGHT | OUT_TOP_BACK_LEFT | OUT_TOP_BACK_RIGHT
228 AUDIO_CHANNEL_OUT_6POINT1 = 0x13Fu, // OUT_FRONT_LEFT | OUT_FRONT_RIGHT | OUT_FRONT_CENTER | OUT_LOW_FREQUENCY | OUT_BACK_LEFT | OUT_BACK_RIGHT | OUT_BACK_CENTER
229 AUDIO_CHANNEL_OUT_7POINT1 = 0x63Fu, // OUT_FRONT_LEFT | OUT_FRONT_RIGHT | OUT_FRONT_CENTER | OUT_LOW_FREQUENCY | OUT_BACK_LEFT | OUT_BACK_RIGHT | OUT_SIDE_LEFT | OUT_SIDE_RIGHT
230 AUDIO_CHANNEL_OUT_7POINT1POINT2 = 0xC063Fu, // OUT_7POINT1 | OUT_TOP_SIDE_LEFT | OUT_TOP_SIDE_RIGHT
231 AUDIO_CHANNEL_OUT_7POINT1POINT4 = 0x2D63Fu, // OUT_7POINT1 | OUT_TOP_FRONT_LEFT | OUT_TOP_FRONT_RIGHT | OUT_TOP_BACK_LEFT | OUT_TOP_BACK_RIGHT
232
233 AUDIO_CHANNEL_IN_LEFT = 0x4u,
234 AUDIO_CHANNEL_IN_RIGHT = 0x8u,
235 AUDIO_CHANNEL_IN_FRONT = 0x10u,
236 AUDIO_CHANNEL_IN_BACK = 0x20u,
237 AUDIO_CHANNEL_IN_LEFT_PROCESSED = 0x40u,
238 AUDIO_CHANNEL_IN_RIGHT_PROCESSED = 0x80u,
239 AUDIO_CHANNEL_IN_FRONT_PROCESSED = 0x100u,
240 AUDIO_CHANNEL_IN_BACK_PROCESSED = 0x200u,
241 AUDIO_CHANNEL_IN_PRESSURE = 0x400u,
242 AUDIO_CHANNEL_IN_X_AXIS = 0x800u,
243 AUDIO_CHANNEL_IN_Y_AXIS = 0x1000u,
244 AUDIO_CHANNEL_IN_Z_AXIS = 0x2000u,
245 AUDIO_CHANNEL_IN_BACK_LEFT = 0x10000u,
246 AUDIO_CHANNEL_IN_BACK_RIGHT = 0x20000u,
247 AUDIO_CHANNEL_IN_CENTER = 0x40000u,
248 AUDIO_CHANNEL_IN_LOW_FREQUENCY = 0x100000u,
249 AUDIO_CHANNEL_IN_TOP_LEFT = 0x200000u,
250 AUDIO_CHANNEL_IN_TOP_RIGHT = 0x400000u,
251 AUDIO_CHANNEL_IN_VOICE_UPLINK = 0x4000u,
252 AUDIO_CHANNEL_IN_VOICE_DNLINK = 0x8000u,
253 AUDIO_CHANNEL_IN_MONO = 0x10u, // IN_FRONT
254 AUDIO_CHANNEL_IN_STEREO = 0xCu, // IN_LEFT | IN_RIGHT
255 AUDIO_CHANNEL_IN_FRONT_BACK = 0x30u, // IN_FRONT | IN_BACK
256 AUDIO_CHANNEL_IN_6 = 0xFCu, // IN_LEFT | IN_RIGHT | IN_FRONT | IN_BACK | IN_LEFT_PROCESSED | IN_RIGHT_PROCESSED
257 AUDIO_CHANNEL_IN_2POINT0POINT2 = 0x60000Cu, // IN_LEFT | IN_RIGHT | IN_TOP_LEFT | IN_TOP_RIGHT
258 AUDIO_CHANNEL_IN_2POINT1POINT2 = 0x70000Cu, // IN_LEFT | IN_RIGHT | IN_TOP_LEFT | IN_TOP_RIGHT | IN_LOW_FREQUENCY
259 AUDIO_CHANNEL_IN_3POINT0POINT2 = 0x64000Cu, // IN_LEFT | IN_CENTER | IN_RIGHT | IN_TOP_LEFT | IN_TOP_RIGHT
260 AUDIO_CHANNEL_IN_3POINT1POINT2 = 0x74000Cu, // IN_LEFT | IN_CENTER | IN_RIGHT | IN_TOP_LEFT | IN_TOP_RIGHT | IN_LOW_FREQUENCY
261 AUDIO_CHANNEL_IN_5POINT1 = 0x17000Cu, // IN_LEFT | IN_CENTER | IN_RIGHT | IN_BACK_LEFT | IN_BACK_RIGHT | IN_LOW_FREQUENCY
262 AUDIO_CHANNEL_IN_VOICE_UPLINK_MONO = 0x4010u, // IN_VOICE_UPLINK | IN_MONO
263 AUDIO_CHANNEL_IN_VOICE_DNLINK_MONO = 0x8010u, // IN_VOICE_DNLINK | IN_MONO
264 AUDIO_CHANNEL_IN_VOICE_CALL_MONO = 0xC010u, // IN_VOICE_UPLINK_MONO | IN_VOICE_DNLINK_MONO
最常见的音频格式是 MP3。还有一些非常普遍的,比如:Flac、Ape、AAC、Ogg、Opus等。
安卓系统在system/media/audio/include/system/audio-base.h里定义的音频格式如下。
71typedef enum {
72 AUDIO_FORMAT_INVALID = 0xFFFFFFFFu,
73 AUDIO_FORMAT_DEFAULT = 0,
74 AUDIO_FORMAT_PCM = 0x00000000u,
75 AUDIO_FORMAT_MP3 = 0x01000000u,
76 AUDIO_FORMAT_AMR_NB = 0x02000000u,
77 AUDIO_FORMAT_AMR_WB = 0x03000000u,
78 AUDIO_FORMAT_AAC = 0x04000000u,
79 AUDIO_FORMAT_HE_AAC_V1 = 0x05000000u,
80 AUDIO_FORMAT_HE_AAC_V2 = 0x06000000u,
81 AUDIO_FORMAT_VORBIS = 0x07000000u,
82 AUDIO_FORMAT_OPUS = 0x08000000u,
83 AUDIO_FORMAT_AC3 = 0x09000000u,
84 AUDIO_FORMAT_E_AC3 = 0x0A000000u,
85 AUDIO_FORMAT_DTS = 0x0B000000u,
86 AUDIO_FORMAT_DTS_HD = 0x0C000000u,
87 AUDIO_FORMAT_IEC61937 = 0x0D000000u,
88 AUDIO_FORMAT_DOLBY_TRUEHD = 0x0E000000u,
89 AUDIO_FORMAT_EVRC = 0x10000000u,
90 AUDIO_FORMAT_EVRCB = 0x11000000u,
91 AUDIO_FORMAT_EVRCWB = 0x12000000u,
92 AUDIO_FORMAT_EVRCNW = 0x13000000u,
93 AUDIO_FORMAT_AAC_ADIF = 0x14000000u,
94 AUDIO_FORMAT_WMA = 0x15000000u,
95 AUDIO_FORMAT_WMA_PRO = 0x16000000u,
96 AUDIO_FORMAT_AMR_WB_PLUS = 0x17000000u,
97 AUDIO_FORMAT_MP2 = 0x18000000u,
98 AUDIO_FORMAT_QCELP = 0x19000000u,
99 AUDIO_FORMAT_DSD = 0x1A000000u,
100 AUDIO_FORMAT_FLAC = 0x1B000000u,
101 AUDIO_FORMAT_ALAC = 0x1C000000u,
102 AUDIO_FORMAT_APE = 0x1D000000u,
103 AUDIO_FORMAT_AAC_ADTS = 0x1E000000u,
104 AUDIO_FORMAT_SBC = 0x1F000000u,
105 AUDIO_FORMAT_APTX = 0x20000000u,
106 AUDIO_FORMAT_APTX_HD = 0x21000000u,
107 AUDIO_FORMAT_AC4 = 0x22000000u,
108 AUDIO_FORMAT_LDAC = 0x23000000u,
109 AUDIO_FORMAT_MAT = 0x24000000u,
110 AUDIO_FORMAT_MAIN_MASK = 0xFF000000u,
111 AUDIO_FORMAT_SUB_MASK = 0x00FFFFFFu,
112
113 /* Subformats */
114 AUDIO_FORMAT_PCM_SUB_16_BIT = 0x1u,
115 AUDIO_FORMAT_PCM_SUB_8_BIT = 0x2u,
116 AUDIO_FORMAT_PCM_SUB_32_BIT = 0x3u,
117 AUDIO_FORMAT_PCM_SUB_8_24_BIT = 0x4u,
118 AUDIO_FORMAT_PCM_SUB_FLOAT = 0x5u,
119 AUDIO_FORMAT_PCM_SUB_24_BIT_PACKED = 0x6u,
120
121 AUDIO_FORMAT_MP3_SUB_NONE = 0x0u,
122
123 AUDIO_FORMAT_AMR_SUB_NONE = 0x0u,
124
125 AUDIO_FORMAT_AAC_SUB_MAIN = 0x1u,
126 AUDIO_FORMAT_AAC_SUB_LC = 0x2u,
127 AUDIO_FORMAT_AAC_SUB_SSR = 0x4u,
128 AUDIO_FORMAT_AAC_SUB_LTP = 0x8u,
129 AUDIO_FORMAT_AAC_SUB_HE_V1 = 0x10u,
130 AUDIO_FORMAT_AAC_SUB_SCALABLE = 0x20u,
131 AUDIO_FORMAT_AAC_SUB_ERLC = 0x40u,
132 AUDIO_FORMAT_AAC_SUB_LD = 0x80u,
133 AUDIO_FORMAT_AAC_SUB_HE_V2 = 0x100u,
134 AUDIO_FORMAT_AAC_SUB_ELD = 0x200u,
135 AUDIO_FORMAT_AAC_SUB_XHE = 0x300u,
136
137 AUDIO_FORMAT_VORBIS_SUB_NONE = 0x0u,
138
139 AUDIO_FORMAT_E_AC3_SUB_JOC = 0x1u,
140
141 AUDIO_FORMAT_MAT_SUB_1_0 = 0x1u,
142 AUDIO_FORMAT_MAT_SUB_2_0 = 0x2u,
143 AUDIO_FORMAT_MAT_SUB_2_1 = 0x3u,
144
145 /* Aliases */
146 AUDIO_FORMAT_PCM_16_BIT = 0x1u, // (PCM | PCM_SUB_16_BIT)
147 AUDIO_FORMAT_PCM_8_BIT = 0x2u, // (PCM | PCM_SUB_8_BIT)
148 AUDIO_FORMAT_PCM_32_BIT = 0x3u, // (PCM | PCM_SUB_32_BIT)
149 AUDIO_FORMAT_PCM_8_24_BIT = 0x4u, // (PCM | PCM_SUB_8_24_BIT)
150 AUDIO_FORMAT_PCM_FLOAT = 0x5u, // (PCM | PCM_SUB_FLOAT)
151 AUDIO_FORMAT_PCM_24_BIT_PACKED = 0x6u, // (PCM | PCM_SUB_24_BIT_PACKED)
152 AUDIO_FORMAT_AAC_MAIN = 0x4000001u, // (AAC | AAC_SUB_MAIN)
153 AUDIO_FORMAT_AAC_LC = 0x4000002u, // (AAC | AAC_SUB_LC)
154 AUDIO_FORMAT_AAC_SSR = 0x4000004u, // (AAC | AAC_SUB_SSR)
155 AUDIO_FORMAT_AAC_LTP = 0x4000008u, // (AAC | AAC_SUB_LTP)
156 AUDIO_FORMAT_AAC_HE_V1 = 0x4000010u, // (AAC | AAC_SUB_HE_V1)
157 AUDIO_FORMAT_AAC_SCALABLE = 0x4000020u, // (AAC | AAC_SUB_SCALABLE)
158 AUDIO_FORMAT_AAC_ERLC = 0x4000040u, // (AAC | AAC_SUB_ERLC)
159 AUDIO_FORMAT_AAC_LD = 0x4000080u, // (AAC | AAC_SUB_LD)
160 AUDIO_FORMAT_AAC_HE_V2 = 0x4000100u, // (AAC | AAC_SUB_HE_V2)
161 AUDIO_FORMAT_AAC_ELD = 0x4000200u, // (AAC | AAC_SUB_ELD)
162 AUDIO_FORMAT_AAC_XHE = 0x4000300u, // (AAC | AAC_SUB_XHE)
163 AUDIO_FORMAT_AAC_ADTS_MAIN = 0x1e000001u, // (AAC_ADTS | AAC_SUB_MAIN)
164 AUDIO_FORMAT_AAC_ADTS_LC = 0x1e000002u, // (AAC_ADTS | AAC_SUB_LC)
165 AUDIO_FORMAT_AAC_ADTS_SSR = 0x1e000004u, // (AAC_ADTS | AAC_SUB_SSR)
166 AUDIO_FORMAT_AAC_ADTS_LTP = 0x1e000008u, // (AAC_ADTS | AAC_SUB_LTP)
167 AUDIO_FORMAT_AAC_ADTS_HE_V1 = 0x1e000010u, // (AAC_ADTS | AAC_SUB_HE_V1)
168 AUDIO_FORMAT_AAC_ADTS_SCALABLE = 0x1e000020u, // (AAC_ADTS | AAC_SUB_SCALABLE)
169 AUDIO_FORMAT_AAC_ADTS_ERLC = 0x1e000040u, // (AAC_ADTS | AAC_SUB_ERLC)
170 AUDIO_FORMAT_AAC_ADTS_LD = 0x1e000080u, // (AAC_ADTS | AAC_SUB_LD)
171 AUDIO_FORMAT_AAC_ADTS_HE_V2 = 0x1e000100u, // (AAC_ADTS | AAC_SUB_HE_V2)
172 AUDIO_FORMAT_AAC_ADTS_ELD = 0x1e000200u, // (AAC_ADTS | AAC_SUB_ELD)
173 AUDIO_FORMAT_AAC_ADTS_XHE = 0x1e000300u, // (AAC_ADTS | AAC_SUB_XHE)
174 AUDIO_FORMAT_E_AC3_JOC = 0xA000001u, // (E_AC3 | E_AC3_SUB_JOC)
175 AUDIO_FORMAT_MAT_1_0 = 0x24000001u, // (MAT | MAT_SUB_1_0)
176 AUDIO_FORMAT_MAT_2_0 = 0x24000002u, // (MAT | MAT_SUB_2_0)
177 AUDIO_FORMAT_MAT_2_1 = 0x24000003u, // (MAT | MAT_SUB_2_1)
178} audio_format_t;
对于一个未压缩的双声道、16bit位深、44.1kHz采样率的音频,它的码率是:2 * 16 * 44100 = 1411200 bps,约1411kbps(除以8换算成字节,即176400字节/秒),这个码率对于早期的网络来说,压力还是比较大的。
而经过MP3、AAC这样的压缩格式后,码率能显著降低到100 ~ 320kbps左右,而且音质损失在可控程度。
音频的压缩主要是基于频率掩蔽效应和时域掩蔽效应,以及人的听力频率范围20Hz ~ 20kHz。
与视频压缩的Profile类似,不同的Profile适用不同的场景,AAC格式下最常见的Profile就是LC,Low Complexity,低复杂度规格。
MP3、AAC、Ogg都属于有损压缩格式,意思是:压缩后,再解压回来,已经恢复不到原本的样子了。
音频的帧率比较有特点,音频的帧率与音频格式和采样率有关。
也就是说,如果采样率不同,帧率也是不同的。
截图中的帧率 43.066 = 44100/1024,其中44100是采样率,1024是AAC每一帧包含的采样点数。
使用ffmpeg对音频文件进行格式转换。
ffmpeg -i input.mp3 -c:a aac output.aac
使用特定的编码器进行编码。
ffmpeg -i input.mp3 -c:a libfdk_aac output.aac
转码时指定Profile、码率等其他参数。
ffmpeg -i input.mp3 -c:a libfdk_aac -profile:a lc -b:a 160k output.aac