af_atempo.c 38 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083108410851086108710881089109010911092109310941095109610971098109911001101110211031104110511061107110811091110111111121113111411151116111711181119112011211122112311241125112611271128112911301131113211331134113511361137113811391140114111421143114411451146114711481149115011511152115311541155115611571158115911601161116211631164116511661167116811691170117111721173117411751176117711781179118011811182118311841185118611871188118911901191119211931194119511961197119811991200120112021203
  1. /*
  2. * Copyright (c) 2012 Pavel Koshevoy <pkoshevoy at gmail dot com>
  3. *
  4. * This file is part of FFmpeg.
  5. *
  6. * FFmpeg is free software; you can redistribute it and/or
  7. * modify it under the terms of the GNU Lesser General Public
  8. * License as published by the Free Software Foundation; either
  9. * version 2.1 of the License, or (at your option) any later version.
  10. *
  11. * FFmpeg is distributed in the hope that it will be useful,
  12. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  13. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  14. * Lesser General Public License for more details.
  15. *
  16. * You should have received a copy of the GNU Lesser General Public
  17. * License along with FFmpeg; if not, write to the Free Software
  18. * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  19. */
  20. /**
  21. * @file
  22. * tempo scaling audio filter -- an implementation of WSOLA algorithm
  23. *
  24. * Based on MIT licensed yaeAudioTempoFilter.h and yaeAudioFragment.h
  25. * from Apprentice Video player by Pavel Koshevoy.
  26. * https://sourceforge.net/projects/apprenticevideo/
  27. *
  28. * An explanation of SOLA algorithm is available at
  29. * http://www.surina.net/article/time-and-pitch-scaling.html
  30. *
  31. * WSOLA is very similar to SOLA, only one major difference exists between
  32. * these algorithms. SOLA shifts audio fragments along the output stream,
  33. * whereas WSOLA shifts audio fragments along the input stream.
  34. *
  35. * The advantage of WSOLA algorithm is that the overlap region size is
  36. * always the same, therefore the blending function is constant and
  37. * can be precomputed.
  38. */
  39. #include <float.h>
  40. #include "libavcodec/avfft.h"
  41. #include "libavutil/avassert.h"
  42. #include "libavutil/avstring.h"
  43. #include "libavutil/channel_layout.h"
  44. #include "libavutil/eval.h"
  45. #include "libavutil/opt.h"
  46. #include "libavutil/samplefmt.h"
  47. #include "avfilter.h"
  48. #include "audio.h"
  49. #include "internal.h"
  50. /**
  51. * A fragment of audio waveform
  52. */
  53. typedef struct {
  54. // index of the first sample of this fragment in the overall waveform;
  55. // 0: input sample position
  56. // 1: output sample position
  57. int64_t position[2];
  58. // original packed multi-channel samples:
  59. uint8_t *data;
  60. // number of samples in this fragment:
  61. int nsamples;
  62. // rDFT transform of the down-mixed mono fragment, used for
  63. // fast waveform alignment via correlation in frequency domain:
  64. FFTSample *xdat;
  65. } AudioFragment;
  /**
   * States of the filter state machine.
   */
  typedef enum {
      YAE_LOAD_FRAGMENT,      /* load input data for the current fragment */
      YAE_ADJUST_POSITION,    /* align current fragment to the previous one */
      YAE_RELOAD_FRAGMENT,    /* reload the fragment at its adjusted position */
      YAE_OUTPUT_OVERLAP_ADD, /* blend the overlap region and output it */
      YAE_FLUSH_OUTPUT,       /* set by yae_flush() when draining the filter */
  } FilterState;
  76. /**
  77. * Filter state machine
  78. */
  79. typedef struct {
  80. const AVClass *class;
  81. // ring-buffer of input samples, necessary because some times
  82. // input fragment position may be adjusted backwards:
  83. uint8_t *buffer;
  84. // ring-buffer maximum capacity, expressed in sample rate time base:
  85. int ring;
  86. // ring-buffer house keeping:
  87. int size;
  88. int head;
  89. int tail;
  90. // 0: input sample position corresponding to the ring buffer tail
  91. // 1: output sample position
  92. int64_t position[2];
  93. // sample format:
  94. enum AVSampleFormat format;
  95. // number of channels:
  96. int channels;
  97. // row of bytes to skip from one sample to next, across multple channels;
  98. // stride = (number-of-channels * bits-per-sample-per-channel) / 8
  99. int stride;
  100. // fragment window size, power-of-two integer:
  101. int window;
  102. // Hann window coefficients, for feathering
  103. // (blending) the overlapping fragment region:
  104. float *hann;
  105. // tempo scaling factor:
  106. double tempo;
  107. // a snapshot of previous fragment input and output position values
  108. // captured when the tempo scale factor was set most recently:
  109. int64_t origin[2];
  110. // current/previous fragment ring-buffer:
  111. AudioFragment frag[2];
  112. // current fragment index:
  113. uint64_t nfrag;
  114. // current state:
  115. FilterState state;
  116. // for fast correlation calculation in frequency domain:
  117. RDFTContext *real_to_complex;
  118. RDFTContext *complex_to_real;
  119. FFTSample *correlation;
  120. // for managing AVFilterPad.request_frame and AVFilterPad.filter_frame
  121. AVFrame *dst_buffer;
  122. uint8_t *dst;
  123. uint8_t *dst_end;
  124. uint64_t nsamples_in;
  125. uint64_t nsamples_out;
  126. } ATempoContext;
  127. #define OFFSET(x) offsetof(ATempoContext, x)
  128. static const AVOption atempo_options[] = {
  129. { "tempo", "set tempo scale factor",
  130. OFFSET(tempo), AV_OPT_TYPE_DOUBLE, { .dbl = 1.0 }, 0.5, 2.0,
  131. AV_OPT_FLAG_AUDIO_PARAM | AV_OPT_FLAG_FILTERING_PARAM },
  132. { NULL }
  133. };
  134. AVFILTER_DEFINE_CLASS(atempo);
  135. inline static AudioFragment *yae_curr_frag(ATempoContext *atempo)
  136. {
  137. return &atempo->frag[atempo->nfrag % 2];
  138. }
  139. inline static AudioFragment *yae_prev_frag(ATempoContext *atempo)
  140. {
  141. return &atempo->frag[(atempo->nfrag + 1) % 2];
  142. }
  143. /**
  144. * Reset filter to initial state, do not deallocate existing local buffers.
  145. */
  146. static void yae_clear(ATempoContext *atempo)
  147. {
  148. atempo->size = 0;
  149. atempo->head = 0;
  150. atempo->tail = 0;
  151. atempo->nfrag = 0;
  152. atempo->state = YAE_LOAD_FRAGMENT;
  153. atempo->position[0] = 0;
  154. atempo->position[1] = 0;
  155. atempo->origin[0] = 0;
  156. atempo->origin[1] = 0;
  157. atempo->frag[0].position[0] = 0;
  158. atempo->frag[0].position[1] = 0;
  159. atempo->frag[0].nsamples = 0;
  160. atempo->frag[1].position[0] = 0;
  161. atempo->frag[1].position[1] = 0;
  162. atempo->frag[1].nsamples = 0;
  163. // shift left position of 1st fragment by half a window
  164. // so that no re-normalization would be required for
  165. // the left half of the 1st fragment:
  166. atempo->frag[0].position[0] = -(int64_t)(atempo->window / 2);
  167. atempo->frag[0].position[1] = -(int64_t)(atempo->window / 2);
  168. av_frame_free(&atempo->dst_buffer);
  169. atempo->dst = NULL;
  170. atempo->dst_end = NULL;
  171. atempo->nsamples_in = 0;
  172. atempo->nsamples_out = 0;
  173. }
  174. /**
  175. * Reset filter to initial state and deallocate all buffers.
  176. */
  177. static void yae_release_buffers(ATempoContext *atempo)
  178. {
  179. yae_clear(atempo);
  180. av_freep(&atempo->frag[0].data);
  181. av_freep(&atempo->frag[1].data);
  182. av_freep(&atempo->frag[0].xdat);
  183. av_freep(&atempo->frag[1].xdat);
  184. av_freep(&atempo->buffer);
  185. av_freep(&atempo->hann);
  186. av_freep(&atempo->correlation);
  187. av_rdft_end(atempo->real_to_complex);
  188. atempo->real_to_complex = NULL;
  189. av_rdft_end(atempo->complex_to_real);
  190. atempo->complex_to_real = NULL;
  191. }
  /*
   * (Re)allocate a buffer field: the old buffer is freed first and a fresh one
   * is obtained with av_malloc(), because av_realloc is not aligned enough and
   * the previous contents do not need to be preserved.
   *
   * Must be expanded inside a function that returns int and has `atempo` in
   * scope: on allocation failure every filter buffer is released and the
   * enclosing function returns AVERROR(ENOMEM).
   */
  #define RE_MALLOC_OR_FAIL(field, field_size)                    \
      do {                                                        \
          av_freep(&(field));                                     \
          (field) = av_malloc(field_size);                        \
          if (!(field)) {                                         \
              yae_release_buffers(atempo);                        \
              return AVERROR(ENOMEM);                             \
          }                                                       \
      } while (0)
  203. /**
  204. * Prepare filter for processing audio data of given format,
  205. * sample rate and number of channels.
  206. */
  207. static int yae_reset(ATempoContext *atempo,
  208. enum AVSampleFormat format,
  209. int sample_rate,
  210. int channels)
  211. {
  212. const int sample_size = av_get_bytes_per_sample(format);
  213. uint32_t nlevels = 0;
  214. uint32_t pot;
  215. int i;
  216. atempo->format = format;
  217. atempo->channels = channels;
  218. atempo->stride = sample_size * channels;
  219. // pick a segment window size:
  220. atempo->window = sample_rate / 24;
  221. // adjust window size to be a power-of-two integer:
  222. nlevels = av_log2(atempo->window);
  223. pot = 1 << nlevels;
  224. av_assert0(pot <= atempo->window);
  225. if (pot < atempo->window) {
  226. atempo->window = pot * 2;
  227. nlevels++;
  228. }
  229. // initialize audio fragment buffers:
  230. RE_MALLOC_OR_FAIL(atempo->frag[0].data, atempo->window * atempo->stride);
  231. RE_MALLOC_OR_FAIL(atempo->frag[1].data, atempo->window * atempo->stride);
  232. RE_MALLOC_OR_FAIL(atempo->frag[0].xdat, atempo->window * sizeof(FFTComplex));
  233. RE_MALLOC_OR_FAIL(atempo->frag[1].xdat, atempo->window * sizeof(FFTComplex));
  234. // initialize rDFT contexts:
  235. av_rdft_end(atempo->real_to_complex);
  236. atempo->real_to_complex = NULL;
  237. av_rdft_end(atempo->complex_to_real);
  238. atempo->complex_to_real = NULL;
  239. atempo->real_to_complex = av_rdft_init(nlevels + 1, DFT_R2C);
  240. if (!atempo->real_to_complex) {
  241. yae_release_buffers(atempo);
  242. return AVERROR(ENOMEM);
  243. }
  244. atempo->complex_to_real = av_rdft_init(nlevels + 1, IDFT_C2R);
  245. if (!atempo->complex_to_real) {
  246. yae_release_buffers(atempo);
  247. return AVERROR(ENOMEM);
  248. }
  249. RE_MALLOC_OR_FAIL(atempo->correlation, atempo->window * sizeof(FFTComplex));
  250. atempo->ring = atempo->window * 3;
  251. RE_MALLOC_OR_FAIL(atempo->buffer, atempo->ring * atempo->stride);
  252. // initialize the Hann window function:
  253. RE_MALLOC_OR_FAIL(atempo->hann, atempo->window * sizeof(float));
  254. for (i = 0; i < atempo->window; i++) {
  255. double t = (double)i / (double)(atempo->window - 1);
  256. double h = 0.5 * (1.0 - cos(2.0 * M_PI * t));
  257. atempo->hann[i] = (float)h;
  258. }
  259. yae_clear(atempo);
  260. return 0;
  261. }
  262. static int yae_set_tempo(AVFilterContext *ctx, const char *arg_tempo)
  263. {
  264. const AudioFragment *prev;
  265. ATempoContext *atempo = ctx->priv;
  266. char *tail = NULL;
  267. double tempo = av_strtod(arg_tempo, &tail);
  268. if (tail && *tail) {
  269. av_log(ctx, AV_LOG_ERROR, "Invalid tempo value '%s'\n", arg_tempo);
  270. return AVERROR(EINVAL);
  271. }
  272. if (tempo < 0.5 || tempo > 2.0) {
  273. av_log(ctx, AV_LOG_ERROR, "Tempo value %f exceeds [0.5, 2.0] range\n",
  274. tempo);
  275. return AVERROR(EINVAL);
  276. }
  277. prev = yae_prev_frag(atempo);
  278. atempo->origin[0] = prev->position[0] + atempo->window / 2;
  279. atempo->origin[1] = prev->position[1] + atempo->window / 2;
  280. atempo->tempo = tempo;
  281. return 0;
  282. }
  /**
   * A helper macro that fills frag->xdat with one scalar per sample frame,
   * read from packed samples of the given scalar type.
   *
   * Expects `atempo`, `frag` and a `const uint8_t *src` cursor in scope;
   * `src` is advanced past the frag->nsamples frames that are consumed.
   *
   * With one channel each sample is converted as is. With several channels
   * the sample kept for a frame is the one with the largest magnitude, where
   * magnitudes are capped at scalar_max before being compared.
   */
  #define yae_init_xdat(scalar_type, scalar_max)                          \
      do {                                                                \
          const uint8_t *end = src +                                      \
              frag->nsamples * atempo->channels * sizeof(scalar_type);    \
          FFTSample *out = frag->xdat;                                    \
                                                                          \
          if (atempo->channels == 1) {                                    \
              while (src < end) {                                         \
                  *out++ = (FFTSample)*(const scalar_type *)src;          \
                  src += sizeof(scalar_type);                             \
              }                                                           \
          } else {                                                        \
              while (src < end) {                                         \
                  FFTSample peak = (FFTSample)*(const scalar_type *)src;  \
                  FFTSample peak_mag = FFMIN((FFTSample)scalar_max,       \
                                             (FFTSample)fabsf(peak));     \
                  int ch;                                                 \
                                                                          \
                  src += sizeof(scalar_type);                             \
                                                                          \
                  for (ch = 1; ch < atempo->channels; ch++) {             \
                      const FFTSample val =                               \
                          (FFTSample)*(const scalar_type *)src;           \
                      const FFTSample mag =                               \
                          FFMIN((FFTSample)scalar_max,                    \
                                (FFTSample)fabsf(val));                   \
                                                                          \
                      src += sizeof(scalar_type);                         \
                                                                          \
                      if (peak_mag < mag) {                               \
                          peak_mag = mag;                                 \
                          peak     = val;                                 \
                      }                                                   \
                  }                                                       \
                                                                          \
                  *out++ = peak;                                          \
              }                                                           \
          }                                                               \
      } while (0)
  332. /**
  333. * Initialize complex data buffer of a given audio fragment
  334. * with down-mixed mono data of appropriate scalar type.
  335. */
  336. static void yae_downmix(ATempoContext *atempo, AudioFragment *frag)
  337. {
  338. // shortcuts:
  339. const uint8_t *src = frag->data;
  340. // init complex data buffer used for FFT and Correlation:
  341. memset(frag->xdat, 0, sizeof(FFTComplex) * atempo->window);
  342. if (atempo->format == AV_SAMPLE_FMT_U8) {
  343. yae_init_xdat(uint8_t, 127);
  344. } else if (atempo->format == AV_SAMPLE_FMT_S16) {
  345. yae_init_xdat(int16_t, 32767);
  346. } else if (atempo->format == AV_SAMPLE_FMT_S32) {
  347. yae_init_xdat(int, 2147483647);
  348. } else if (atempo->format == AV_SAMPLE_FMT_FLT) {
  349. yae_init_xdat(float, 1);
  350. } else if (atempo->format == AV_SAMPLE_FMT_DBL) {
  351. yae_init_xdat(double, 1);
  352. }
  353. }
  354. /**
  355. * Populate the internal data buffer on as-needed basis.
  356. *
  357. * @return
  358. * 0 if requested data was already available or was successfully loaded,
  359. * AVERROR(EAGAIN) if more input data is required.
  360. */
  361. static int yae_load_data(ATempoContext *atempo,
  362. const uint8_t **src_ref,
  363. const uint8_t *src_end,
  364. int64_t stop_here)
  365. {
  366. // shortcut:
  367. const uint8_t *src = *src_ref;
  368. const int read_size = stop_here - atempo->position[0];
  369. if (stop_here <= atempo->position[0]) {
  370. return 0;
  371. }
  372. // samples are not expected to be skipped:
  373. av_assert0(read_size <= atempo->ring);
  374. while (atempo->position[0] < stop_here && src < src_end) {
  375. int src_samples = (src_end - src) / atempo->stride;
  376. // load data piece-wise, in order to avoid complicating the logic:
  377. int nsamples = FFMIN(read_size, src_samples);
  378. int na;
  379. int nb;
  380. nsamples = FFMIN(nsamples, atempo->ring);
  381. na = FFMIN(nsamples, atempo->ring - atempo->tail);
  382. nb = FFMIN(nsamples - na, atempo->ring);
  383. if (na) {
  384. uint8_t *a = atempo->buffer + atempo->tail * atempo->stride;
  385. memcpy(a, src, na * atempo->stride);
  386. src += na * atempo->stride;
  387. atempo->position[0] += na;
  388. atempo->size = FFMIN(atempo->size + na, atempo->ring);
  389. atempo->tail = (atempo->tail + na) % atempo->ring;
  390. atempo->head =
  391. atempo->size < atempo->ring ?
  392. atempo->tail - atempo->size :
  393. atempo->tail;
  394. }
  395. if (nb) {
  396. uint8_t *b = atempo->buffer;
  397. memcpy(b, src, nb * atempo->stride);
  398. src += nb * atempo->stride;
  399. atempo->position[0] += nb;
  400. atempo->size = FFMIN(atempo->size + nb, atempo->ring);
  401. atempo->tail = (atempo->tail + nb) % atempo->ring;
  402. atempo->head =
  403. atempo->size < atempo->ring ?
  404. atempo->tail - atempo->size :
  405. atempo->tail;
  406. }
  407. }
  408. // pass back the updated source buffer pointer:
  409. *src_ref = src;
  410. // sanity check:
  411. av_assert0(atempo->position[0] <= stop_here);
  412. return atempo->position[0] == stop_here ? 0 : AVERROR(EAGAIN);
  413. }
  414. /**
  415. * Populate current audio fragment data buffer.
  416. *
  417. * @return
  418. * 0 when the fragment is ready,
  419. * AVERROR(EAGAIN) if more input data is required.
  420. */
  421. static int yae_load_frag(ATempoContext *atempo,
  422. const uint8_t **src_ref,
  423. const uint8_t *src_end)
  424. {
  425. // shortcuts:
  426. AudioFragment *frag = yae_curr_frag(atempo);
  427. uint8_t *dst;
  428. int64_t missing, start, zeros;
  429. uint32_t nsamples;
  430. const uint8_t *a, *b;
  431. int i0, i1, n0, n1, na, nb;
  432. int64_t stop_here = frag->position[0] + atempo->window;
  433. if (src_ref && yae_load_data(atempo, src_ref, src_end, stop_here) != 0) {
  434. return AVERROR(EAGAIN);
  435. }
  436. // calculate the number of samples we don't have:
  437. missing =
  438. stop_here > atempo->position[0] ?
  439. stop_here - atempo->position[0] : 0;
  440. nsamples =
  441. missing < (int64_t)atempo->window ?
  442. (uint32_t)(atempo->window - missing) : 0;
  443. // setup the output buffer:
  444. frag->nsamples = nsamples;
  445. dst = frag->data;
  446. start = atempo->position[0] - atempo->size;
  447. zeros = 0;
  448. if (frag->position[0] < start) {
  449. // what we don't have we substitute with zeros:
  450. zeros = FFMIN(start - frag->position[0], (int64_t)nsamples);
  451. av_assert0(zeros != nsamples);
  452. memset(dst, 0, zeros * atempo->stride);
  453. dst += zeros * atempo->stride;
  454. }
  455. if (zeros == nsamples) {
  456. return 0;
  457. }
  458. // get the remaining data from the ring buffer:
  459. na = (atempo->head < atempo->tail ?
  460. atempo->tail - atempo->head :
  461. atempo->ring - atempo->head);
  462. nb = atempo->head < atempo->tail ? 0 : atempo->tail;
  463. // sanity check:
  464. av_assert0(nsamples <= zeros + na + nb);
  465. a = atempo->buffer + atempo->head * atempo->stride;
  466. b = atempo->buffer;
  467. i0 = frag->position[0] + zeros - start;
  468. i1 = i0 < na ? 0 : i0 - na;
  469. n0 = i0 < na ? FFMIN(na - i0, (int)(nsamples - zeros)) : 0;
  470. n1 = nsamples - zeros - n0;
  471. if (n0) {
  472. memcpy(dst, a + i0 * atempo->stride, n0 * atempo->stride);
  473. dst += n0 * atempo->stride;
  474. }
  475. if (n1) {
  476. memcpy(dst, b + i1 * atempo->stride, n1 * atempo->stride);
  477. }
  478. return 0;
  479. }
  480. /**
  481. * Prepare for loading next audio fragment.
  482. */
  483. static void yae_advance_to_next_frag(ATempoContext *atempo)
  484. {
  485. const double fragment_step = atempo->tempo * (double)(atempo->window / 2);
  486. const AudioFragment *prev;
  487. AudioFragment *frag;
  488. atempo->nfrag++;
  489. prev = yae_prev_frag(atempo);
  490. frag = yae_curr_frag(atempo);
  491. frag->position[0] = prev->position[0] + (int64_t)fragment_step;
  492. frag->position[1] = prev->position[1] + atempo->window / 2;
  493. frag->nsamples = 0;
  494. }
  495. /**
  496. * Calculate cross-correlation via rDFT.
  497. *
  498. * Multiply two vectors of complex numbers (result of real_to_complex rDFT)
  499. * and transform back via complex_to_real rDFT.
  500. */
  501. static void yae_xcorr_via_rdft(FFTSample *xcorr,
  502. RDFTContext *complex_to_real,
  503. const FFTComplex *xa,
  504. const FFTComplex *xb,
  505. const int window)
  506. {
  507. FFTComplex *xc = (FFTComplex *)xcorr;
  508. int i;
  509. // NOTE: first element requires special care -- Given Y = rDFT(X),
  510. // Im(Y[0]) and Im(Y[N/2]) are always zero, therefore av_rdft_calc
  511. // stores Re(Y[N/2]) in place of Im(Y[0]).
  512. xc->re = xa->re * xb->re;
  513. xc->im = xa->im * xb->im;
  514. xa++;
  515. xb++;
  516. xc++;
  517. for (i = 1; i < window; i++, xa++, xb++, xc++) {
  518. xc->re = (xa->re * xb->re + xa->im * xb->im);
  519. xc->im = (xa->im * xb->re - xa->re * xb->im);
  520. }
  521. // apply inverse rDFT:
  522. av_rdft_calc(complex_to_real, xcorr);
  523. }
  524. /**
  525. * Calculate alignment offset for given fragment
  526. * relative to the previous fragment.
  527. *
  528. * @return alignment offset of current fragment relative to previous.
  529. */
  530. static int yae_align(AudioFragment *frag,
  531. const AudioFragment *prev,
  532. const int window,
  533. const int delta_max,
  534. const int drift,
  535. FFTSample *correlation,
  536. RDFTContext *complex_to_real)
  537. {
  538. int best_offset = -drift;
  539. FFTSample best_metric = -FLT_MAX;
  540. FFTSample *xcorr;
  541. int i0;
  542. int i1;
  543. int i;
  544. yae_xcorr_via_rdft(correlation,
  545. complex_to_real,
  546. (const FFTComplex *)prev->xdat,
  547. (const FFTComplex *)frag->xdat,
  548. window);
  549. // identify search window boundaries:
  550. i0 = FFMAX(window / 2 - delta_max - drift, 0);
  551. i0 = FFMIN(i0, window);
  552. i1 = FFMIN(window / 2 + delta_max - drift, window - window / 16);
  553. i1 = FFMAX(i1, 0);
  554. // identify cross-correlation peaks within search window:
  555. xcorr = correlation + i0;
  556. for (i = i0; i < i1; i++, xcorr++) {
  557. FFTSample metric = *xcorr;
  558. // normalize:
  559. FFTSample drifti = (FFTSample)(drift + i);
  560. metric *= drifti * (FFTSample)(i - i0) * (FFTSample)(i1 - i);
  561. if (metric > best_metric) {
  562. best_metric = metric;
  563. best_offset = i - window / 2;
  564. }
  565. }
  566. return best_offset;
  567. }
  568. /**
  569. * Adjust current fragment position for better alignment
  570. * with previous fragment.
  571. *
  572. * @return alignment correction.
  573. */
  574. static int yae_adjust_position(ATempoContext *atempo)
  575. {
  576. const AudioFragment *prev = yae_prev_frag(atempo);
  577. AudioFragment *frag = yae_curr_frag(atempo);
  578. const double prev_output_position =
  579. (double)(prev->position[1] - atempo->origin[1] + atempo->window / 2);
  580. const double ideal_output_position =
  581. (double)(prev->position[0] - atempo->origin[0] + atempo->window / 2) /
  582. atempo->tempo;
  583. const int drift = (int)(prev_output_position - ideal_output_position);
  584. const int delta_max = atempo->window / 2;
  585. const int correction = yae_align(frag,
  586. prev,
  587. atempo->window,
  588. delta_max,
  589. drift,
  590. atempo->correlation,
  591. atempo->complex_to_real);
  592. if (correction) {
  593. // adjust fragment position:
  594. frag->position[0] -= correction;
  595. // clear so that the fragment can be reloaded:
  596. frag->nsamples = 0;
  597. }
  598. return correction;
  599. }
  /**
   * A helper macro that blends the overlap region of the previous and the
   * current audio fragment for samples of the given scalar type.
   *
   * Expects in scope: `atempo`, `frag`, source pointers `a` (previous
   * fragment) and `b` (current fragment), weight pointers `wa` and `wb`,
   * the `overlap` length, and the `dst` / `dst_end` output range.
   * `wa`, `wb`, `dst` and atempo->position[1] are advanced once per sample
   * frame written; blending stops when the overlap is done or dst is full.
   *
   * Frames whose input position is negative are copied from the previous
   * fragment unblended.
   */
  #define yae_blend(scalar_type)                                          \
      do {                                                                \
          const scalar_type *prev_in = (const scalar_type *)a;            \
          const scalar_type *curr_in = (const scalar_type *)b;            \
          scalar_type *out = (scalar_type *)dst;                          \
          scalar_type *const out_limit = (scalar_type *)dst_end;          \
          int64_t n;                                                      \
                                                                          \
          for (n = 0; n < overlap && out < out_limit; n++) {              \
              const float w_prev = *wa++;                                 \
              const float w_curr = *wb++;                                 \
              int ch;                                                     \
                                                                          \
              for (ch = 0; ch < atempo->channels; ch++) {                 \
                  const float s_prev = (float)*prev_in;                   \
                  const float s_curr = (float)*curr_in;                   \
                                                                          \
                  if (frag->position[0] + n < 0) {                        \
                      *out = *prev_in;                                    \
                  } else {                                                \
                      *out = (scalar_type)(s_prev * w_prev +              \
                                           s_curr * w_curr);              \
                  }                                                       \
                                                                          \
                  prev_in++;                                              \
                  curr_in++;                                              \
                  out++;                                                  \
              }                                                           \
                                                                          \
              atempo->position[1]++;                                      \
          }                                                               \
                                                                          \
          dst = (uint8_t *)out;                                           \
      } while (0)
  632. /**
  633. * Blend the overlap region of previous and current audio fragment
  634. * and output the results to the given destination buffer.
  635. *
  636. * @return
  637. * 0 if the overlap region was completely stored in the dst buffer,
  638. * AVERROR(EAGAIN) if more destination buffer space is required.
  639. */
  640. static int yae_overlap_add(ATempoContext *atempo,
  641. uint8_t **dst_ref,
  642. uint8_t *dst_end)
  643. {
  644. // shortcuts:
  645. const AudioFragment *prev = yae_prev_frag(atempo);
  646. const AudioFragment *frag = yae_curr_frag(atempo);
  647. const int64_t start_here = FFMAX(atempo->position[1],
  648. frag->position[1]);
  649. const int64_t stop_here = FFMIN(prev->position[1] + prev->nsamples,
  650. frag->position[1] + frag->nsamples);
  651. const int64_t overlap = stop_here - start_here;
  652. const int64_t ia = start_here - prev->position[1];
  653. const int64_t ib = start_here - frag->position[1];
  654. const float *wa = atempo->hann + ia;
  655. const float *wb = atempo->hann + ib;
  656. const uint8_t *a = prev->data + ia * atempo->stride;
  657. const uint8_t *b = frag->data + ib * atempo->stride;
  658. uint8_t *dst = *dst_ref;
  659. av_assert0(start_here <= stop_here &&
  660. frag->position[1] <= start_here &&
  661. overlap <= frag->nsamples);
  662. if (atempo->format == AV_SAMPLE_FMT_U8) {
  663. yae_blend(uint8_t);
  664. } else if (atempo->format == AV_SAMPLE_FMT_S16) {
  665. yae_blend(int16_t);
  666. } else if (atempo->format == AV_SAMPLE_FMT_S32) {
  667. yae_blend(int);
  668. } else if (atempo->format == AV_SAMPLE_FMT_FLT) {
  669. yae_blend(float);
  670. } else if (atempo->format == AV_SAMPLE_FMT_DBL) {
  671. yae_blend(double);
  672. }
  673. // pass-back the updated destination buffer pointer:
  674. *dst_ref = dst;
  675. return atempo->position[1] == stop_here ? 0 : AVERROR(EAGAIN);
  676. }
  677. /**
  678. * Feed as much data to the filter as it is able to consume
  679. * and receive as much processed data in the destination buffer
  680. * as it is able to produce or store.
  681. */
  682. static void
  683. yae_apply(ATempoContext *atempo,
  684. const uint8_t **src_ref,
  685. const uint8_t *src_end,
  686. uint8_t **dst_ref,
  687. uint8_t *dst_end)
  688. {
  689. while (1) {
  690. if (atempo->state == YAE_LOAD_FRAGMENT) {
  691. // load additional data for the current fragment:
  692. if (yae_load_frag(atempo, src_ref, src_end) != 0) {
  693. break;
  694. }
  695. // down-mix to mono:
  696. yae_downmix(atempo, yae_curr_frag(atempo));
  697. // apply rDFT:
  698. av_rdft_calc(atempo->real_to_complex, yae_curr_frag(atempo)->xdat);
  699. // must load the second fragment before alignment can start:
  700. if (!atempo->nfrag) {
  701. yae_advance_to_next_frag(atempo);
  702. continue;
  703. }
  704. atempo->state = YAE_ADJUST_POSITION;
  705. }
  706. if (atempo->state == YAE_ADJUST_POSITION) {
  707. // adjust position for better alignment:
  708. if (yae_adjust_position(atempo)) {
  709. // reload the fragment at the corrected position, so that the
  710. // Hann window blending would not require normalization:
  711. atempo->state = YAE_RELOAD_FRAGMENT;
  712. } else {
  713. atempo->state = YAE_OUTPUT_OVERLAP_ADD;
  714. }
  715. }
  716. if (atempo->state == YAE_RELOAD_FRAGMENT) {
  717. // load additional data if necessary due to position adjustment:
  718. if (yae_load_frag(atempo, src_ref, src_end) != 0) {
  719. break;
  720. }
  721. // down-mix to mono:
  722. yae_downmix(atempo, yae_curr_frag(atempo));
  723. // apply rDFT:
  724. av_rdft_calc(atempo->real_to_complex, yae_curr_frag(atempo)->xdat);
  725. atempo->state = YAE_OUTPUT_OVERLAP_ADD;
  726. }
  727. if (atempo->state == YAE_OUTPUT_OVERLAP_ADD) {
  728. // overlap-add and output the result:
  729. if (yae_overlap_add(atempo, dst_ref, dst_end) != 0) {
  730. break;
  731. }
  732. // advance to the next fragment, repeat:
  733. yae_advance_to_next_frag(atempo);
  734. atempo->state = YAE_LOAD_FRAGMENT;
  735. }
  736. }
  737. }
  738. /**
  739. * Flush any buffered data from the filter.
  740. *
  741. * @return
  742. * 0 if all data was completely stored in the dst buffer,
  743. * AVERROR(EAGAIN) if more destination buffer space is required.
  744. */
  745. static int yae_flush(ATempoContext *atempo,
  746. uint8_t **dst_ref,
  747. uint8_t *dst_end)
  748. {
  749. AudioFragment *frag = yae_curr_frag(atempo);
  750. int64_t overlap_end;
  751. int64_t start_here;
  752. int64_t stop_here;
  753. int64_t offset;
  754. const uint8_t *src;
  755. uint8_t *dst;
  756. int src_size;
  757. int dst_size;
  758. int nbytes;
  759. atempo->state = YAE_FLUSH_OUTPUT;
  760. if (atempo->position[0] == frag->position[0] + frag->nsamples &&
  761. atempo->position[1] == frag->position[1] + frag->nsamples) {
  762. // the current fragment is already flushed:
  763. return 0;
  764. }
  765. if (frag->position[0] + frag->nsamples < atempo->position[0]) {
  766. // finish loading the current (possibly partial) fragment:
  767. yae_load_frag(atempo, NULL, NULL);
  768. if (atempo->nfrag) {
  769. // down-mix to mono:
  770. yae_downmix(atempo, frag);
  771. // apply rDFT:
  772. av_rdft_calc(atempo->real_to_complex, frag->xdat);
  773. // align current fragment to previous fragment:
  774. if (yae_adjust_position(atempo)) {
  775. // reload the current fragment due to adjusted position:
  776. yae_load_frag(atempo, NULL, NULL);
  777. }
  778. }
  779. }
  780. // flush the overlap region:
  781. overlap_end = frag->position[1] + FFMIN(atempo->window / 2,
  782. frag->nsamples);
  783. while (atempo->position[1] < overlap_end) {
  784. if (yae_overlap_add(atempo, dst_ref, dst_end) != 0) {
  785. return AVERROR(EAGAIN);
  786. }
  787. }
  788. // check whether all of the input samples have been consumed:
  789. if (frag->position[0] + frag->nsamples < atempo->position[0]) {
  790. yae_advance_to_next_frag(atempo);
  791. return AVERROR(EAGAIN);
  792. }
  793. // flush the remainder of the current fragment:
  794. start_here = FFMAX(atempo->position[1], overlap_end);
  795. stop_here = frag->position[1] + frag->nsamples;
  796. offset = start_here - frag->position[1];
  797. av_assert0(start_here <= stop_here && frag->position[1] <= start_here);
  798. src = frag->data + offset * atempo->stride;
  799. dst = (uint8_t *)*dst_ref;
  800. src_size = (int)(stop_here - start_here) * atempo->stride;
  801. dst_size = dst_end - dst;
  802. nbytes = FFMIN(src_size, dst_size);
  803. memcpy(dst, src, nbytes);
  804. dst += nbytes;
  805. atempo->position[1] += (nbytes / atempo->stride);
  806. // pass-back the updated destination buffer pointer:
  807. *dst_ref = (uint8_t *)dst;
  808. return atempo->position[1] == stop_here ? 0 : AVERROR(EAGAIN);
  809. }
  810. static av_cold int init(AVFilterContext *ctx)
  811. {
  812. ATempoContext *atempo = ctx->priv;
  813. atempo->format = AV_SAMPLE_FMT_NONE;
  814. atempo->state = YAE_LOAD_FRAGMENT;
  815. return 0;
  816. }
  817. static av_cold void uninit(AVFilterContext *ctx)
  818. {
  819. ATempoContext *atempo = ctx->priv;
  820. yae_release_buffers(atempo);
  821. }
  822. static int query_formats(AVFilterContext *ctx)
  823. {
  824. AVFilterChannelLayouts *layouts = NULL;
  825. AVFilterFormats *formats = NULL;
  826. // WSOLA necessitates an internal sliding window ring buffer
  827. // for incoming audio stream.
  828. //
  829. // Planar sample formats are too cumbersome to store in a ring buffer,
  830. // therefore planar sample formats are not supported.
  831. //
  832. static const enum AVSampleFormat sample_fmts[] = {
  833. AV_SAMPLE_FMT_U8,
  834. AV_SAMPLE_FMT_S16,
  835. AV_SAMPLE_FMT_S32,
  836. AV_SAMPLE_FMT_FLT,
  837. AV_SAMPLE_FMT_DBL,
  838. AV_SAMPLE_FMT_NONE
  839. };
  840. int ret;
  841. layouts = ff_all_channel_layouts();
  842. if (!layouts) {
  843. return AVERROR(ENOMEM);
  844. }
  845. ret = ff_set_common_channel_layouts(ctx, layouts);
  846. if (ret < 0)
  847. return ret;
  848. formats = ff_make_format_list(sample_fmts);
  849. if (!formats) {
  850. return AVERROR(ENOMEM);
  851. }
  852. ret = ff_set_common_formats(ctx, formats);
  853. if (ret < 0)
  854. return ret;
  855. formats = ff_all_samplerates();
  856. if (!formats) {
  857. return AVERROR(ENOMEM);
  858. }
  859. return ff_set_common_samplerates(ctx, formats);
  860. }
  861. static int config_props(AVFilterLink *inlink)
  862. {
  863. AVFilterContext *ctx = inlink->dst;
  864. ATempoContext *atempo = ctx->priv;
  865. enum AVSampleFormat format = inlink->format;
  866. int sample_rate = (int)inlink->sample_rate;
  867. int channels = av_get_channel_layout_nb_channels(inlink->channel_layout);
  868. return yae_reset(atempo, format, sample_rate, channels);
  869. }
  870. static int push_samples(ATempoContext *atempo,
  871. AVFilterLink *outlink,
  872. int n_out)
  873. {
  874. int ret;
  875. atempo->dst_buffer->sample_rate = outlink->sample_rate;
  876. atempo->dst_buffer->nb_samples = n_out;
  877. // adjust the PTS:
  878. atempo->dst_buffer->pts =
  879. av_rescale_q(atempo->nsamples_out,
  880. (AVRational){ 1, outlink->sample_rate },
  881. outlink->time_base);
  882. ret = ff_filter_frame(outlink, atempo->dst_buffer);
  883. atempo->dst_buffer = NULL;
  884. atempo->dst = NULL;
  885. atempo->dst_end = NULL;
  886. if (ret < 0)
  887. return ret;
  888. atempo->nsamples_out += n_out;
  889. return 0;
  890. }
  891. static int filter_frame(AVFilterLink *inlink, AVFrame *src_buffer)
  892. {
  893. AVFilterContext *ctx = inlink->dst;
  894. ATempoContext *atempo = ctx->priv;
  895. AVFilterLink *outlink = ctx->outputs[0];
  896. int ret = 0;
  897. int n_in = src_buffer->nb_samples;
  898. int n_out = (int)(0.5 + ((double)n_in) / atempo->tempo);
  899. const uint8_t *src = src_buffer->data[0];
  900. const uint8_t *src_end = src + n_in * atempo->stride;
  901. while (src < src_end) {
  902. if (!atempo->dst_buffer) {
  903. atempo->dst_buffer = ff_get_audio_buffer(outlink, n_out);
  904. if (!atempo->dst_buffer)
  905. return AVERROR(ENOMEM);
  906. av_frame_copy_props(atempo->dst_buffer, src_buffer);
  907. atempo->dst = atempo->dst_buffer->data[0];
  908. atempo->dst_end = atempo->dst + n_out * atempo->stride;
  909. }
  910. yae_apply(atempo, &src, src_end, &atempo->dst, atempo->dst_end);
  911. if (atempo->dst == atempo->dst_end) {
  912. int n_samples = ((atempo->dst - atempo->dst_buffer->data[0]) /
  913. atempo->stride);
  914. ret = push_samples(atempo, outlink, n_samples);
  915. if (ret < 0)
  916. goto end;
  917. }
  918. }
  919. atempo->nsamples_in += n_in;
  920. end:
  921. av_frame_free(&src_buffer);
  922. return ret;
  923. }
  924. static int request_frame(AVFilterLink *outlink)
  925. {
  926. AVFilterContext *ctx = outlink->src;
  927. ATempoContext *atempo = ctx->priv;
  928. int ret;
  929. ret = ff_request_frame(ctx->inputs[0]);
  930. if (ret == AVERROR_EOF) {
  931. // flush the filter:
  932. int n_max = atempo->ring;
  933. int n_out;
  934. int err = AVERROR(EAGAIN);
  935. while (err == AVERROR(EAGAIN)) {
  936. if (!atempo->dst_buffer) {
  937. atempo->dst_buffer = ff_get_audio_buffer(outlink, n_max);
  938. if (!atempo->dst_buffer)
  939. return AVERROR(ENOMEM);
  940. atempo->dst = atempo->dst_buffer->data[0];
  941. atempo->dst_end = atempo->dst + n_max * atempo->stride;
  942. }
  943. err = yae_flush(atempo, &atempo->dst, atempo->dst_end);
  944. n_out = ((atempo->dst - atempo->dst_buffer->data[0]) /
  945. atempo->stride);
  946. if (n_out) {
  947. ret = push_samples(atempo, outlink, n_out);
  948. }
  949. }
  950. av_frame_free(&atempo->dst_buffer);
  951. atempo->dst = NULL;
  952. atempo->dst_end = NULL;
  953. return AVERROR_EOF;
  954. }
  955. return ret;
  956. }
  957. static int process_command(AVFilterContext *ctx,
  958. const char *cmd,
  959. const char *arg,
  960. char *res,
  961. int res_len,
  962. int flags)
  963. {
  964. return !strcmp(cmd, "tempo") ? yae_set_tempo(ctx, arg) : AVERROR(ENOSYS);
  965. }
  966. static const AVFilterPad atempo_inputs[] = {
  967. {
  968. .name = "default",
  969. .type = AVMEDIA_TYPE_AUDIO,
  970. .filter_frame = filter_frame,
  971. .config_props = config_props,
  972. },
  973. { NULL }
  974. };
  975. static const AVFilterPad atempo_outputs[] = {
  976. {
  977. .name = "default",
  978. .request_frame = request_frame,
  979. .type = AVMEDIA_TYPE_AUDIO,
  980. },
  981. { NULL }
  982. };
  983. AVFilter ff_af_atempo = {
  984. .name = "atempo",
  985. .description = NULL_IF_CONFIG_SMALL("Adjust audio tempo."),
  986. .init = init,
  987. .uninit = uninit,
  988. .query_formats = query_formats,
  989. .process_command = process_command,
  990. .priv_size = sizeof(ATempoContext),
  991. .priv_class = &atempo_class,
  992. .inputs = atempo_inputs,
  993. .outputs = atempo_outputs,
  994. };