af_atempo.c 37 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031103210331034103510361037103810391040104110421043104410451046104710481049105010511052105310541055105610571058105910601061106210631064106510661067106810691070107110721073107410751076107710781079108010811082108310841085108610871088108910901091109210931094109510961097109810991100110111021103110411051106110711081109111011111112111311141115111611171118111911201121112211231124112511261127112811291130113111321133113411351136113711381139114011411142114311441145114611471148114911501151115211531154115511561157115811591160116111621163
  1. /*
  2. * Copyright (c) 2012 Pavel Koshevoy <pkoshevoy at gmail dot com>
  3. *
  4. * This file is part of FFmpeg.
  5. *
  6. * FFmpeg is free software; you can redistribute it and/or
  7. * modify it under the terms of the GNU Lesser General Public
  8. * License as published by the Free Software Foundation; either
  9. * version 2.1 of the License, or (at your option) any later version.
  10. *
  11. * FFmpeg is distributed in the hope that it will be useful,
  12. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  13. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  14. * Lesser General Public License for more details.
  15. *
  16. * You should have received a copy of the GNU Lesser General Public
  17. * License along with FFmpeg; if not, write to the Free Software
  18. * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  19. */
  20. /**
  21. * @file
  22. * tempo scaling audio filter -- an implementation of WSOLA algorithm
  23. *
  24. * Based on MIT licensed yaeAudioTempoFilter.h and yaeAudioFragment.h
  25. * from Apprentice Video player by Pavel Koshevoy.
  26. * https://sourceforge.net/projects/apprenticevideo/
  27. *
  28. * An explanation of SOLA algorithm is available at
  29. * http://www.surina.net/article/time-and-pitch-scaling.html
  30. *
  31. * WSOLA is very similar to SOLA, only one major difference exists between
  32. * these algorithms. SOLA shifts audio fragments along the output stream,
  33. * where as WSOLA shifts audio fragments along the input stream.
  34. *
  35. * The advantage of WSOLA algorithm is that the overlap region size is
  36. * always the same, therefore the blending function is constant and
  37. * can be precomputed.
  38. */
  39. #include <float.h>
  40. #include "libavcodec/avfft.h"
  41. #include "libavutil/audioconvert.h"
  42. #include "libavutil/avassert.h"
  43. #include "libavutil/avstring.h"
  44. #include "libavutil/eval.h"
  45. #include "libavutil/opt.h"
  46. #include "libavutil/samplefmt.h"
  47. #include "avfilter.h"
  48. #include "audio.h"
  49. #include "internal.h"
/**
 * A fragment of audio waveform
 */
typedef struct {
    // index of the first sample of this fragment in the overall waveform;
    // 0: input sample position
    // 1: output sample position
    int64_t position[2];

    // original packed (interleaved) multi-channel samples:
    uint8_t *data;

    // number of samples in this fragment:
    int nsamples;

    // rDFT transform of the down-mixed mono fragment, used for
    // fast waveform alignment via correlation in frequency domain:
    FFTSample *xdat;
} AudioFragment;
/**
 * Filter state machine states
 */
typedef enum {
    YAE_LOAD_FRAGMENT,      // fill the current fragment from the input stream
    YAE_ADJUST_POSITION,    // align the current fragment with the previous one
    YAE_RELOAD_FRAGMENT,    // reload fragment data at the corrected position
    YAE_OUTPUT_OVERLAP_ADD, // blend the overlap region and emit output
    YAE_FLUSH_OUTPUT,       // drain whatever remains in internal buffers
} FilterState;
/**
 * Filter state machine
 */
typedef struct {
    // ring-buffer of input samples, necessary because sometimes
    // input fragment position may be adjusted backwards:
    uint8_t *buffer;

    // ring-buffer maximum capacity, expressed in sample rate time base:
    int ring;

    // ring-buffer house keeping:
    int size;
    int head;
    int tail;

    // 0: input sample position corresponding to the ring buffer tail
    // 1: output sample position
    int64_t position[2];

    // sample format:
    enum AVSampleFormat format;

    // number of channels:
    int channels;

    // row of bytes to skip from one sample to next, across multiple channels;
    // stride = (number-of-channels * bits-per-sample-per-channel) / 8
    int stride;

    // fragment window size, power-of-two integer:
    int window;

    // Hann window coefficients, for feathering
    // (blending) the overlapping fragment region:
    float *hann;

    // tempo scaling factor:
    double tempo;

    // cumulative alignment drift, in samples:
    int drift;

    // current/previous fragment ring-buffer:
    AudioFragment frag[2];

    // current fragment index (frag[nfrag % 2] is the current fragment):
    uint64_t nfrag;

    // current state:
    FilterState state;

    // for fast correlation calculation in frequency domain:
    RDFTContext *real_to_complex;
    RDFTContext *complex_to_real;
    FFTSample *correlation;

    // for managing AVFilterPad.request_frame and AVFilterPad.filter_samples
    int request_fulfilled;
    AVFilterBufferRef *dst_buffer;
    uint8_t *dst;
    uint8_t *dst_end;
    uint64_t nsamples_in;
    uint64_t nsamples_out;
} ATempoContext;
  126. /**
  127. * Reset filter to initial state, do not deallocate existing local buffers.
  128. */
  129. static void yae_clear(ATempoContext *atempo)
  130. {
  131. atempo->size = 0;
  132. atempo->head = 0;
  133. atempo->tail = 0;
  134. atempo->drift = 0;
  135. atempo->nfrag = 0;
  136. atempo->state = YAE_LOAD_FRAGMENT;
  137. atempo->position[0] = 0;
  138. atempo->position[1] = 0;
  139. atempo->frag[0].position[0] = 0;
  140. atempo->frag[0].position[1] = 0;
  141. atempo->frag[0].nsamples = 0;
  142. atempo->frag[1].position[0] = 0;
  143. atempo->frag[1].position[1] = 0;
  144. atempo->frag[1].nsamples = 0;
  145. // shift left position of 1st fragment by half a window
  146. // so that no re-normalization would be required for
  147. // the left half of the 1st fragment:
  148. atempo->frag[0].position[0] = -(int64_t)(atempo->window / 2);
  149. atempo->frag[0].position[1] = -(int64_t)(atempo->window / 2);
  150. avfilter_unref_bufferp(&atempo->dst_buffer);
  151. atempo->dst = NULL;
  152. atempo->dst_end = NULL;
  153. atempo->request_fulfilled = 0;
  154. atempo->nsamples_in = 0;
  155. atempo->nsamples_out = 0;
  156. }
  157. /**
  158. * Reset filter to initial state and deallocate all buffers.
  159. */
  160. static void yae_release_buffers(ATempoContext *atempo)
  161. {
  162. yae_clear(atempo);
  163. av_freep(&atempo->frag[0].data);
  164. av_freep(&atempo->frag[1].data);
  165. av_freep(&atempo->frag[0].xdat);
  166. av_freep(&atempo->frag[1].xdat);
  167. av_freep(&atempo->buffer);
  168. av_freep(&atempo->hann);
  169. av_freep(&atempo->correlation);
  170. av_rdft_end(atempo->real_to_complex);
  171. atempo->real_to_complex = NULL;
  172. av_rdft_end(atempo->complex_to_real);
  173. atempo->complex_to_real = NULL;
  174. }
/* av_realloc is not aligned enough; fortunately, the data does not need to
 * be preserved */
/**
 * (Re)allocate 'field' to hold 'field_size' bytes.  On allocation failure
 * all filter buffers are released and the ENCLOSING function returns
 * AVERROR(ENOMEM) -- so this macro may only be used inside int-returning
 * functions that have an ATempoContext 'atempo' in scope.
 */
#define RE_MALLOC_OR_FAIL(field, field_size)    \
    do {                                        \
        av_freep(&field);                       \
        field = av_malloc(field_size);          \
        if (!field) {                           \
            yae_release_buffers(atempo);        \
            return AVERROR(ENOMEM);             \
        }                                       \
    } while (0)
/**
 * Prepare filter for processing audio data of given format,
 * sample rate and number of channels.
 *
 * Sizes the fragment window to roughly 1/24 second of audio, rounded up
 * to a power of two, and (re)allocates every internal buffer accordingly.
 *
 * @return 0 on success, AVERROR(ENOMEM) on allocation failure (in which
 *         case all buffers are released).
 */
static int yae_reset(ATempoContext *atempo,
                     enum AVSampleFormat format,
                     int sample_rate,
                     int channels)
{
    const int sample_size = av_get_bytes_per_sample(format);
    uint32_t nlevels = 0;
    uint32_t pot;
    int i;

    atempo->format = format;
    atempo->channels = channels;
    atempo->stride = sample_size * channels;

    // pick a segment window size (~1/24 sec of audio):
    atempo->window = sample_rate / 24;

    // adjust window size to be a power-of-two integer:
    nlevels = av_log2(atempo->window);
    pot = 1 << nlevels;
    av_assert0(pot <= atempo->window);

    if (pot < atempo->window) {
        // round up to the next power of two:
        atempo->window = pot * 2;
        nlevels++;
    }

    // initialize audio fragment buffers:
    // xdat is sized in FFTComplex (window complex values = 2 * window
    // FFTSample) to hold the output of the size-2*window rDFT below:
    RE_MALLOC_OR_FAIL(atempo->frag[0].data, atempo->window * atempo->stride);
    RE_MALLOC_OR_FAIL(atempo->frag[1].data, atempo->window * atempo->stride);
    RE_MALLOC_OR_FAIL(atempo->frag[0].xdat, atempo->window * sizeof(FFTComplex));
    RE_MALLOC_OR_FAIL(atempo->frag[1].xdat, atempo->window * sizeof(FFTComplex));

    // initialize rDFT contexts:
    av_rdft_end(atempo->real_to_complex);
    atempo->real_to_complex = NULL;

    av_rdft_end(atempo->complex_to_real);
    atempo->complex_to_real = NULL;

    // nlevels + 1 bits == transform size of 2 * window real samples:
    atempo->real_to_complex = av_rdft_init(nlevels + 1, DFT_R2C);
    if (!atempo->real_to_complex) {
        yae_release_buffers(atempo);
        return AVERROR(ENOMEM);
    }

    atempo->complex_to_real = av_rdft_init(nlevels + 1, IDFT_C2R);
    if (!atempo->complex_to_real) {
        yae_release_buffers(atempo);
        return AVERROR(ENOMEM);
    }

    RE_MALLOC_OR_FAIL(atempo->correlation, atempo->window * sizeof(FFTComplex));

    // the ring buffer holds three windows worth of input samples:
    atempo->ring = atempo->window * 3;
    RE_MALLOC_OR_FAIL(atempo->buffer, atempo->ring * atempo->stride);

    // initialize the Hann window function (symmetric form, endpoints at 0):
    RE_MALLOC_OR_FAIL(atempo->hann, atempo->window * sizeof(float));
    for (i = 0; i < atempo->window; i++) {
        double t = (double)i / (double)(atempo->window - 1);
        double h = 0.5 * (1.0 - cos(2.0 * M_PI * t));
        atempo->hann[i] = (float)h;
    }

    yae_clear(atempo);
    return 0;
}
  245. static int yae_set_tempo(AVFilterContext *ctx, const char *arg_tempo)
  246. {
  247. ATempoContext *atempo = ctx->priv;
  248. char *tail = NULL;
  249. double tempo = av_strtod(arg_tempo, &tail);
  250. if (tail && *tail) {
  251. av_log(ctx, AV_LOG_ERROR, "Invalid tempo value '%s'\n", arg_tempo);
  252. return AVERROR(EINVAL);
  253. }
  254. if (tempo < 0.5 || tempo > 2.0) {
  255. av_log(ctx, AV_LOG_ERROR, "Tempo value %f exceeds [0.5, 2.0] range\n",
  256. tempo);
  257. return AVERROR(EINVAL);
  258. }
  259. atempo->tempo = tempo;
  260. return 0;
  261. }
  262. inline static AudioFragment *yae_curr_frag(ATempoContext *atempo)
  263. {
  264. return &atempo->frag[atempo->nfrag % 2];
  265. }
  266. inline static AudioFragment *yae_prev_frag(ATempoContext *atempo)
  267. {
  268. return &atempo->frag[(atempo->nfrag + 1) % 2];
  269. }
/**
 * A helper macro for initializing complex data buffer with scalar data
 * of a given type.
 *
 * Consumes frag->nsamples interleaved frames (atempo->channels samples
 * each) through the 'src' cursor and writes one FFTSample per frame into
 * frag->xdat.  Mono input is copied verbatim; multi-channel input is
 * down-mixed by keeping, per frame, the channel sample with the largest
 * magnitude (magnitude clamped to scalar_max for the comparison only --
 * the unclamped sample value is what gets stored).
 *
 * Expects 'atempo', 'frag' and a mutable 'const uint8_t *src' cursor to
 * be in scope at the expansion site; 'src' is advanced past the frames
 * read.
 */
#define yae_init_xdat(scalar_type, scalar_max)                          \
    do {                                                                \
        const uint8_t *src_end = src +                                  \
            frag->nsamples * atempo->channels * sizeof(scalar_type);    \
                                                                        \
        FFTSample *xdat = frag->xdat;                                   \
        scalar_type tmp;                                                \
                                                                        \
        if (atempo->channels == 1) {                                    \
            for (; src < src_end; xdat++) {                             \
                tmp = *(const scalar_type *)src;                        \
                src += sizeof(scalar_type);                             \
                                                                        \
                *xdat = (FFTSample)tmp;                                 \
            }                                                           \
        } else {                                                        \
            FFTSample s, max, ti, si;                                   \
            int i;                                                      \
                                                                        \
            for (; src < src_end; xdat++) {                             \
                tmp = *(const scalar_type *)src;                        \
                src += sizeof(scalar_type);                             \
                                                                        \
                max = (FFTSample)tmp;                                   \
                s = FFMIN((FFTSample)scalar_max,                        \
                          (FFTSample)fabsf(max));                       \
                                                                        \
                for (i = 1; i < atempo->channels; i++) {                \
                    tmp = *(const scalar_type *)src;                    \
                    src += sizeof(scalar_type);                         \
                                                                        \
                    ti = (FFTSample)tmp;                                \
                    si = FFMIN((FFTSample)scalar_max,                   \
                               (FFTSample)fabsf(ti));                   \
                                                                        \
                    if (s < si) {                                       \
                        s = si;                                         \
                        max = ti;                                       \
                    }                                                   \
                }                                                       \
                                                                        \
                *xdat = max;                                            \
            }                                                           \
        }                                                               \
    } while (0)
  319. /**
  320. * Initialize complex data buffer of a given audio fragment
  321. * with down-mixed mono data of appropriate scalar type.
  322. */
  323. static void yae_downmix(ATempoContext *atempo, AudioFragment *frag)
  324. {
  325. // shortcuts:
  326. const uint8_t *src = frag->data;
  327. // init complex data buffer used for FFT and Correlation:
  328. memset(frag->xdat, 0, sizeof(FFTComplex) * atempo->window);
  329. if (atempo->format == AV_SAMPLE_FMT_U8) {
  330. yae_init_xdat(uint8_t, 127);
  331. } else if (atempo->format == AV_SAMPLE_FMT_S16) {
  332. yae_init_xdat(int16_t, 32767);
  333. } else if (atempo->format == AV_SAMPLE_FMT_S32) {
  334. yae_init_xdat(int, 2147483647);
  335. } else if (atempo->format == AV_SAMPLE_FMT_FLT) {
  336. yae_init_xdat(float, 1);
  337. } else if (atempo->format == AV_SAMPLE_FMT_DBL) {
  338. yae_init_xdat(double, 1);
  339. }
  340. }
/**
 * Populate the internal data buffer on as-needed basis.
 *
 * Copies samples from *src_ref into the ring buffer until the input
 * position reaches 'stop_here' or the source is exhausted, advancing
 * *src_ref past the consumed bytes.
 *
 * @return
 *   0 if requested data was already available or was successfully loaded,
 *   AVERROR(EAGAIN) if more input data is required.
 */
static int yae_load_data(ATempoContext *atempo,
                         const uint8_t **src_ref,
                         const uint8_t *src_end,
                         int64_t stop_here)
{
    // shortcut:
    const uint8_t *src = *src_ref;
    const int read_size = stop_here - atempo->position[0];

    if (stop_here <= atempo->position[0]) {
        // already have everything up to stop_here:
        return 0;
    }

    // samples are not expected to be skipped:
    av_assert0(read_size <= atempo->ring);

    while (atempo->position[0] < stop_here && src < src_end) {
        int src_samples = (src_end - src) / atempo->stride;

        // load data piece-wise, in order to avoid complicating the logic:
        int nsamples = FFMIN(read_size, src_samples);
        int na;
        int nb;

        nsamples = FFMIN(nsamples, atempo->ring);
        // na: contiguous span up to the physical end of the ring buffer,
        // nb: wrap-around remainder copied to the start of the buffer:
        na = FFMIN(nsamples, atempo->ring - atempo->tail);
        nb = FFMIN(nsamples - na, atempo->ring);

        if (na) {
            uint8_t *a = atempo->buffer + atempo->tail * atempo->stride;
            memcpy(a, src, na * atempo->stride);

            src += na * atempo->stride;
            atempo->position[0] += na;

            atempo->size = FFMIN(atempo->size + na, atempo->ring);
            atempo->tail = (atempo->tail + na) % atempo->ring;
            // while the ring is not yet full tail == size, so head stays
            // put; once full, head follows tail (oldest sample dropped):
            atempo->head =
                atempo->size < atempo->ring ?
                atempo->tail - atempo->size :
                atempo->tail;
        }

        if (nb) {
            uint8_t *b = atempo->buffer;
            memcpy(b, src, nb * atempo->stride);

            src += nb * atempo->stride;
            atempo->position[0] += nb;

            atempo->size = FFMIN(atempo->size + nb, atempo->ring);
            atempo->tail = (atempo->tail + nb) % atempo->ring;
            atempo->head =
                atempo->size < atempo->ring ?
                atempo->tail - atempo->size :
                atempo->tail;
        }
    }

    // pass back the updated source buffer pointer:
    *src_ref = src;

    // sanity check:
    av_assert0(atempo->position[0] <= stop_here);

    return atempo->position[0] == stop_here ? 0 : AVERROR(EAGAIN);
}
/**
 * Populate current audio fragment data buffer.
 *
 * Pulls up to one window of samples ending at the fragment's input
 * position from the ring buffer (loading more input first, unless
 * src_ref is NULL); samples that precede what the ring buffer still
 * holds are substituted with silence.
 *
 * @return
 *   0 when the fragment is ready,
 *   AVERROR(EAGAIN) if more input data is required.
 */
static int yae_load_frag(ATempoContext *atempo,
                         const uint8_t **src_ref,
                         const uint8_t *src_end)
{
    // shortcuts:
    AudioFragment *frag = yae_curr_frag(atempo);
    uint8_t *dst;
    int64_t missing, start, zeros;
    uint32_t nsamples;
    const uint8_t *a, *b;
    int i0, i1, n0, n1, na, nb;

    int64_t stop_here = frag->position[0] + atempo->window;
    if (src_ref && yae_load_data(atempo, src_ref, src_end, stop_here) != 0) {
        return AVERROR(EAGAIN);
    }

    // calculate the number of samples we don't have:
    missing =
        stop_here > atempo->position[0] ?
        stop_here - atempo->position[0] : 0;

    // number of samples the fragment will actually contain:
    nsamples =
        missing < (int64_t)atempo->window ?
        (uint32_t)(atempo->window - missing) : 0;

    // setup the output buffer:
    frag->nsamples = nsamples;
    dst = frag->data;

    // input position of the oldest sample still in the ring buffer:
    start = atempo->position[0] - atempo->size;
    zeros = 0;

    if (frag->position[0] < start) {
        // what we don't have we substitute with zeros:
        zeros = FFMIN(start - frag->position[0], (int64_t)nsamples);
        av_assert0(zeros != nsamples);

        memset(dst, 0, zeros * atempo->stride);
        dst += zeros * atempo->stride;
    }

    if (zeros == nsamples) {
        return 0;
    }

    // get the remaining data from the ring buffer:
    // na: contiguous samples from head to tail (or to physical end),
    // nb: wrapped-around samples at the start of the buffer:
    na = (atempo->head < atempo->tail ?
          atempo->tail - atempo->head :
          atempo->ring - atempo->head);

    nb = atempo->head < atempo->tail ? 0 : atempo->tail;

    // sanity check:
    av_assert0(nsamples <= zeros + na + nb);

    a = atempo->buffer + atempo->head * atempo->stride;
    b = atempo->buffer;

    // offset of the first wanted sample within span a (or span b):
    i0 = frag->position[0] + zeros - start;
    i1 = i0 < na ? 0 : i0 - na;

    n0 = i0 < na ? FFMIN(na - i0, (int)(nsamples - zeros)) : 0;
    n1 = nsamples - zeros - n0;

    if (n0) {
        memcpy(dst, a + i0 * atempo->stride, n0 * atempo->stride);
        dst += n0 * atempo->stride;
    }

    if (n1) {
        memcpy(dst, b + i1 * atempo->stride, n1 * atempo->stride);
    }

    return 0;
}
  467. /**
  468. * Prepare for loading next audio fragment.
  469. */
  470. static void yae_advance_to_next_frag(ATempoContext *atempo)
  471. {
  472. const double fragment_step = atempo->tempo * (double)(atempo->window / 2);
  473. const AudioFragment *prev;
  474. AudioFragment *frag;
  475. atempo->nfrag++;
  476. prev = yae_prev_frag(atempo);
  477. frag = yae_curr_frag(atempo);
  478. frag->position[0] = prev->position[0] + (int64_t)fragment_step;
  479. frag->position[1] = prev->position[1] + atempo->window / 2;
  480. frag->nsamples = 0;
  481. }
/**
 * Calculate cross-correlation via rDFT.
 *
 * Multiply two vectors of complex numbers (result of real_to_complex rDFT)
 * and transform back via complex_to_real rDFT.  The product uses the
 * conjugate of xa, which yields cross-correlation rather than convolution.
 */
static void yae_xcorr_via_rdft(FFTSample *xcorr,
                               RDFTContext *complex_to_real,
                               const FFTComplex *xa,
                               const FFTComplex *xb,
                               const int window)
{
    FFTComplex *xc = (FFTComplex *)xcorr;
    int i;

    // NOTE: first element requires special care -- Given Y = rDFT(X),
    // Im(Y[0]) and Im(Y[N/2]) are always zero, therefore av_rdft_calc
    // stores Re(Y[N/2]) in place of Im(Y[0]).
    xc->re = xa->re * xb->re;
    xc->im = xa->im * xb->im;
    xa++;
    xb++;
    xc++;

    for (i = 1; i < window; i++, xa++, xb++, xc++) {
        // complex multiply: conj(xa[i]) * xb[i]
        xc->re = (xa->re * xb->re + xa->im * xb->im);
        xc->im = (xa->im * xb->re - xa->re * xb->im);
    }

    // apply inverse rDFT:
    av_rdft_calc(complex_to_real, xcorr);
}
/**
 * Calculate alignment offset for given fragment
 * relative to the previous fragment.
 *
 * Searches for the cross-correlation peak within +/- delta_max samples
 * of the nominal window/2 overlap point, compensating for accumulated
 * drift, and weights candidates toward the center of the search window.
 *
 * @return alignment offset of current fragment relative to previous.
 */
static int yae_align(AudioFragment *frag,
                     const AudioFragment *prev,
                     const int window,
                     const int delta_max,
                     const int drift,
                     FFTSample *correlation,
                     RDFTContext *complex_to_real)
{
    int best_offset = -drift;
    FFTSample best_metric = -FLT_MAX;
    FFTSample *xcorr;

    int i0;
    int i1;
    int i;

    yae_xcorr_via_rdft(correlation,
                       complex_to_real,
                       (const FFTComplex *)prev->xdat,
                       (const FFTComplex *)frag->xdat,
                       window);

    // identify search window boundaries:
    i0 = FFMAX(window / 2 - delta_max - drift, 0);
    i0 = FFMIN(i0, window);

    i1 = FFMIN(window / 2 + delta_max - drift, window - window / 16);
    i1 = FFMAX(i1, 0);

    // identify cross-correlation peaks within search window:
    xcorr = correlation + i0;

    for (i = i0; i < i1; i++, xcorr++) {
        FFTSample metric = *xcorr;

        // normalize: weight the raw correlation by the drift-compensated
        // position and by distance from both search-window edges, so
        // candidates near the edges are penalized
        // NOTE(review): assumes (drift + i) stays non-negative within
        //               [i0, i1) -- holds when delta_max == window / 2;
        //               verify if delta_max is ever reduced.
        FFTSample drifti = (FFTSample)(drift + i);
        metric *= drifti * (FFTSample)(i - i0) * (FFTSample)(i1 - i);

        if (metric > best_metric) {
            best_metric = metric;
            best_offset = i - window / 2;
        }
    }

    return best_offset;
}
  555. /**
  556. * Adjust current fragment position for better alignment
  557. * with previous fragment.
  558. *
  559. * @return alignment correction.
  560. */
  561. static int yae_adjust_position(ATempoContext *atempo)
  562. {
  563. const AudioFragment *prev = yae_prev_frag(atempo);
  564. AudioFragment *frag = yae_curr_frag(atempo);
  565. const int delta_max = atempo->window / 2;
  566. const int correction = yae_align(frag,
  567. prev,
  568. atempo->window,
  569. delta_max,
  570. atempo->drift,
  571. atempo->correlation,
  572. atempo->complex_to_real);
  573. if (correction) {
  574. // adjust fragment position:
  575. frag->position[0] -= correction;
  576. // clear so that the fragment can be reloaded:
  577. frag->nsamples = 0;
  578. // update cumulative correction drift counter:
  579. atempo->drift += correction;
  580. }
  581. return correction;
  582. }
/**
 * A helper macro for blending the overlap region of previous
 * and current audio fragment.
 *
 * Cross-fades up to 'overlap' frames from buffer 'a' (previous fragment,
 * weighted by *wa) and buffer 'b' (current fragment, weighted by *wb)
 * into 'dst', stopping early when 'dst_end' is reached.  Frames whose
 * input position is negative (the lead-in of the very first fragment)
 * are copied from 'a' unblended.  Advances atempo->position[1], the
 * weight pointers and 'dst' as frames are emitted.
 *
 * Expects a, b, wa, wb, dst, dst_end, overlap, frag and atempo to be in
 * scope at the expansion site.
 */
#define yae_blend(scalar_type)                          \
    do {                                                \
        const scalar_type *aaa = (const scalar_type *)a; \
        const scalar_type *bbb = (const scalar_type *)b; \
                                                        \
        scalar_type *out     = (scalar_type *)dst;      \
        scalar_type *out_end = (scalar_type *)dst_end;  \
        int64_t i;                                      \
                                                        \
        for (i = 0; i < overlap && out < out_end;       \
             i++, atempo->position[1]++, wa++, wb++) {  \
            float w0 = *wa;                             \
            float w1 = *wb;                             \
            int j;                                      \
                                                        \
            for (j = 0; j < atempo->channels;           \
                 j++, aaa++, bbb++, out++) {            \
                float t0 = (float)*aaa;                 \
                float t1 = (float)*bbb;                 \
                                                        \
                *out =                                  \
                    frag->position[0] + i < 0 ?         \
                    *aaa :                              \
                    (scalar_type)(t0 * w0 + t1 * w1);   \
            }                                           \
        }                                               \
        dst = (uint8_t *)out;                           \
    } while (0)
/**
 * Blend the overlap region of previous and current audio fragment
 * and output the results to the given destination buffer.
 *
 * @return
 *   0 if the overlap region was completely stored in the dst buffer,
 *   AVERROR(EAGAIN) if more destination buffer space is required.
 */
static int yae_overlap_add(ATempoContext *atempo,
                           uint8_t **dst_ref,
                           uint8_t *dst_end)
{
    // shortcuts:
    const AudioFragment *prev = yae_prev_frag(atempo);
    const AudioFragment *frag = yae_curr_frag(atempo);

    // resume where a previous (partial) call left off, but never before
    // the start of the current fragment:
    const int64_t start_here = FFMAX(atempo->position[1],
                                     frag->position[1]);

    // the overlap ends where either fragment runs out of samples:
    const int64_t stop_here = FFMIN(prev->position[1] + prev->nsamples,
                                    frag->position[1] + frag->nsamples);

    const int64_t overlap = stop_here - start_here;

    // offsets of the blend region within each fragment:
    const int64_t ia = start_here - prev->position[1];
    const int64_t ib = start_here - frag->position[1];

    // Hann window weights for the previous and current fragment:
    const float *wa = atempo->hann + ia;
    const float *wb = atempo->hann + ib;

    const uint8_t *a = prev->data + ia * atempo->stride;
    const uint8_t *b = frag->data + ib * atempo->stride;

    uint8_t *dst = *dst_ref;

    av_assert0(start_here <= stop_here &&
               frag->position[1] <= start_here &&
               overlap <= frag->nsamples);

    // dispatch on sample format; yae_blend advances dst and
    // atempo->position[1] as it emits blended frames:
    if (atempo->format == AV_SAMPLE_FMT_U8) {
        yae_blend(uint8_t);
    } else if (atempo->format == AV_SAMPLE_FMT_S16) {
        yae_blend(int16_t);
    } else if (atempo->format == AV_SAMPLE_FMT_S32) {
        yae_blend(int);
    } else if (atempo->format == AV_SAMPLE_FMT_FLT) {
        yae_blend(float);
    } else if (atempo->format == AV_SAMPLE_FMT_DBL) {
        yae_blend(double);
    }

    // pass-back the updated destination buffer pointer:
    *dst_ref = dst;

    return atempo->position[1] == stop_here ? 0 : AVERROR(EAGAIN);
}
/**
 * Feed as much data to the filter as it is able to consume
 * and receive as much processed data in the destination buffer
 * as it is able to produce or store.
 *
 * Runs the WSOLA state machine; the states are checked with a chain of
 * ifs (not a switch) so that, within a single loop iteration, control
 * deliberately falls through from one completed state into the next.
 * Exits when loading needs more input or overlap-add needs more output
 * space.
 */
static void
yae_apply(ATempoContext *atempo,
          const uint8_t **src_ref,
          const uint8_t *src_end,
          uint8_t **dst_ref,
          uint8_t *dst_end)
{
    while (1) {
        if (atempo->state == YAE_LOAD_FRAGMENT) {
            // load additional data for the current fragment:
            if (yae_load_frag(atempo, src_ref, src_end) != 0) {
                break;
            }

            // down-mix to mono:
            yae_downmix(atempo, yae_curr_frag(atempo));

            // apply rDFT:
            av_rdft_calc(atempo->real_to_complex, yae_curr_frag(atempo)->xdat);

            // must load the second fragment before alignment can start:
            if (!atempo->nfrag) {
                yae_advance_to_next_frag(atempo);
                continue;
            }

            atempo->state = YAE_ADJUST_POSITION;
        }

        if (atempo->state == YAE_ADJUST_POSITION) {
            // adjust position for better alignment:
            if (yae_adjust_position(atempo)) {
                // reload the fragment at the corrected position, so that the
                // Hann window blending would not require normalization:
                atempo->state = YAE_RELOAD_FRAGMENT;
            } else {
                atempo->state = YAE_OUTPUT_OVERLAP_ADD;
            }
        }

        if (atempo->state == YAE_RELOAD_FRAGMENT) {
            // load additional data if necessary due to position adjustment:
            if (yae_load_frag(atempo, src_ref, src_end) != 0) {
                break;
            }

            // down-mix to mono:
            yae_downmix(atempo, yae_curr_frag(atempo));

            // apply rDFT:
            av_rdft_calc(atempo->real_to_complex, yae_curr_frag(atempo)->xdat);

            atempo->state = YAE_OUTPUT_OVERLAP_ADD;
        }

        if (atempo->state == YAE_OUTPUT_OVERLAP_ADD) {
            // overlap-add and output the result:
            if (yae_overlap_add(atempo, dst_ref, dst_end) != 0) {
                break;
            }

            // advance to the next fragment, repeat:
            yae_advance_to_next_frag(atempo);
            atempo->state = YAE_LOAD_FRAGMENT;
        }
    }
}
/**
 * Flush any buffered data from the filter.
 *
 * Finishes loading the current (possibly partial) fragment from the
 * ring buffer, blends its overlap region with the previous fragment,
 * then copies its remaining tail straight to the destination.
 *
 * @return
 *   0 if all data was completely stored in the dst buffer,
 *   AVERROR(EAGAIN) if more destination buffer space is required.
 */
static int yae_flush(ATempoContext *atempo,
                     uint8_t **dst_ref,
                     uint8_t *dst_end)
{
    AudioFragment *frag = yae_curr_frag(atempo);
    int64_t overlap_end;
    int64_t start_here;
    int64_t stop_here;
    int64_t offset;

    const uint8_t *src;
    uint8_t *dst;

    int src_size;
    int dst_size;
    int nbytes;

    atempo->state = YAE_FLUSH_OUTPUT;

    if (atempo->position[0] == frag->position[0] + frag->nsamples &&
        atempo->position[1] == frag->position[1] + frag->nsamples) {
        // the current fragment is already flushed:
        return 0;
    }

    if (frag->position[0] + frag->nsamples < atempo->position[0]) {
        // finish loading the current (possibly partial) fragment:
        // (NULL src_ref -- consume only what the ring buffer holds)
        yae_load_frag(atempo, NULL, NULL);

        if (atempo->nfrag) {
            // down-mix to mono:
            yae_downmix(atempo, frag);

            // apply rDFT:
            av_rdft_calc(atempo->real_to_complex, frag->xdat);

            // align current fragment to previous fragment:
            if (yae_adjust_position(atempo)) {
                // reload the current fragment due to adjusted position:
                yae_load_frag(atempo, NULL, NULL);
            }
        }
    }

    // flush the overlap region:
    overlap_end = frag->position[1] + FFMIN(atempo->window / 2,
                                            frag->nsamples);

    while (atempo->position[1] < overlap_end) {
        if (yae_overlap_add(atempo, dst_ref, dst_end) != 0) {
            return AVERROR(EAGAIN);
        }
    }

    // flush the remainder of the current fragment:
    start_here = FFMAX(atempo->position[1], overlap_end);
    stop_here  = frag->position[1] + frag->nsamples;
    offset     = start_here - frag->position[1];
    av_assert0(start_here <= stop_here && frag->position[1] <= start_here);

    src = frag->data + offset * atempo->stride;
    dst = (uint8_t *)*dst_ref;

    src_size = (int)(stop_here - start_here) * atempo->stride;
    dst_size = dst_end - dst;
    nbytes = FFMIN(src_size, dst_size);

    memcpy(dst, src, nbytes);
    dst += nbytes;

    atempo->position[1] += (nbytes / atempo->stride);

    // pass-back the updated destination buffer pointer:
    *dst_ref = (uint8_t *)dst;

    return atempo->position[1] == stop_here ? 0 : AVERROR(EAGAIN);
}
  788. static av_cold int init(AVFilterContext *ctx, const char *args)
  789. {
  790. ATempoContext *atempo = ctx->priv;
  791. // NOTE: this assumes that the caller has memset ctx->priv to 0:
  792. atempo->format = AV_SAMPLE_FMT_NONE;
  793. atempo->tempo = 1.0;
  794. atempo->state = YAE_LOAD_FRAGMENT;
  795. return args ? yae_set_tempo(ctx, args) : 0;
  796. }
  797. static av_cold void uninit(AVFilterContext *ctx)
  798. {
  799. ATempoContext *atempo = ctx->priv;
  800. yae_release_buffers(atempo);
  801. }
  802. static int query_formats(AVFilterContext *ctx)
  803. {
  804. AVFilterChannelLayouts *layouts = NULL;
  805. AVFilterFormats *formats = NULL;
  806. // WSOLA necessitates an internal sliding window ring buffer
  807. // for incoming audio stream.
  808. //
  809. // Planar sample formats are too cumbersome to store in a ring buffer,
  810. // therefore planar sample formats are not supported.
  811. //
  812. enum AVSampleFormat sample_fmts[] = {
  813. AV_SAMPLE_FMT_U8,
  814. AV_SAMPLE_FMT_S16,
  815. AV_SAMPLE_FMT_S32,
  816. AV_SAMPLE_FMT_FLT,
  817. AV_SAMPLE_FMT_DBL,
  818. AV_SAMPLE_FMT_NONE
  819. };
  820. layouts = ff_all_channel_layouts();
  821. if (!layouts) {
  822. return AVERROR(ENOMEM);
  823. }
  824. ff_set_common_channel_layouts(ctx, layouts);
  825. formats = ff_make_format_list(sample_fmts);
  826. if (!formats) {
  827. return AVERROR(ENOMEM);
  828. }
  829. ff_set_common_formats(ctx, formats);
  830. formats = ff_all_samplerates();
  831. if (!formats) {
  832. return AVERROR(ENOMEM);
  833. }
  834. ff_set_common_samplerates(ctx, formats);
  835. return 0;
  836. }
  837. static int config_props(AVFilterLink *inlink)
  838. {
  839. AVFilterContext *ctx = inlink->dst;
  840. ATempoContext *atempo = ctx->priv;
  841. enum AVSampleFormat format = inlink->format;
  842. int sample_rate = (int)inlink->sample_rate;
  843. int channels = av_get_channel_layout_nb_channels(inlink->channel_layout);
  844. return yae_reset(atempo, format, sample_rate, channels);
  845. }
  846. static void push_samples(ATempoContext *atempo,
  847. AVFilterLink *outlink,
  848. int n_out)
  849. {
  850. atempo->dst_buffer->audio->sample_rate = outlink->sample_rate;
  851. atempo->dst_buffer->audio->nb_samples = n_out;
  852. // adjust the PTS:
  853. atempo->dst_buffer->pts =
  854. av_rescale_q(atempo->nsamples_out,
  855. (AVRational){ 1, outlink->sample_rate },
  856. outlink->time_base);
  857. ff_filter_samples(outlink, atempo->dst_buffer);
  858. atempo->dst_buffer = NULL;
  859. atempo->dst = NULL;
  860. atempo->dst_end = NULL;
  861. atempo->nsamples_out += n_out;
  862. }
  863. static int filter_samples(AVFilterLink *inlink,
  864. AVFilterBufferRef *src_buffer)
  865. {
  866. AVFilterContext *ctx = inlink->dst;
  867. ATempoContext *atempo = ctx->priv;
  868. AVFilterLink *outlink = ctx->outputs[0];
  869. int n_in = src_buffer->audio->nb_samples;
  870. int n_out = (int)(0.5 + ((double)n_in) / atempo->tempo);
  871. const uint8_t *src = src_buffer->data[0];
  872. const uint8_t *src_end = src + n_in * atempo->stride;
  873. while (src < src_end) {
  874. if (!atempo->dst_buffer) {
  875. atempo->dst_buffer = ff_get_audio_buffer(outlink,
  876. AV_PERM_WRITE,
  877. n_out);
  878. avfilter_copy_buffer_ref_props(atempo->dst_buffer, src_buffer);
  879. atempo->dst = atempo->dst_buffer->data[0];
  880. atempo->dst_end = atempo->dst + n_out * atempo->stride;
  881. }
  882. yae_apply(atempo, &src, src_end, &atempo->dst, atempo->dst_end);
  883. if (atempo->dst == atempo->dst_end) {
  884. push_samples(atempo, outlink, n_out);
  885. atempo->request_fulfilled = 1;
  886. }
  887. }
  888. atempo->nsamples_in += n_in;
  889. avfilter_unref_bufferp(&src_buffer);
  890. return 0;
  891. }
  892. static int request_frame(AVFilterLink *outlink)
  893. {
  894. AVFilterContext *ctx = outlink->src;
  895. ATempoContext *atempo = ctx->priv;
  896. int ret;
  897. atempo->request_fulfilled = 0;
  898. do {
  899. ret = ff_request_frame(ctx->inputs[0]);
  900. }
  901. while (!atempo->request_fulfilled && ret >= 0);
  902. if (ret == AVERROR_EOF) {
  903. // flush the filter:
  904. int n_max = atempo->ring;
  905. int n_out;
  906. int err = AVERROR(EAGAIN);
  907. while (err == AVERROR(EAGAIN)) {
  908. if (!atempo->dst_buffer) {
  909. atempo->dst_buffer = ff_get_audio_buffer(outlink,
  910. AV_PERM_WRITE,
  911. n_max);
  912. atempo->dst = atempo->dst_buffer->data[0];
  913. atempo->dst_end = atempo->dst + n_max * atempo->stride;
  914. }
  915. err = yae_flush(atempo, &atempo->dst, atempo->dst_end);
  916. n_out = ((atempo->dst - atempo->dst_buffer->data[0]) /
  917. atempo->stride);
  918. if (n_out) {
  919. push_samples(atempo, outlink, n_out);
  920. }
  921. }
  922. avfilter_unref_bufferp(&atempo->dst_buffer);
  923. atempo->dst = NULL;
  924. atempo->dst_end = NULL;
  925. return AVERROR_EOF;
  926. }
  927. return ret;
  928. }
  929. static int process_command(AVFilterContext *ctx,
  930. const char *cmd,
  931. const char *arg,
  932. char *res,
  933. int res_len,
  934. int flags)
  935. {
  936. return !strcmp(cmd, "tempo") ? yae_set_tempo(ctx, arg) : AVERROR(ENOSYS);
  937. }
  938. AVFilter avfilter_af_atempo = {
  939. .name = "atempo",
  940. .description = NULL_IF_CONFIG_SMALL("Adjust audio tempo."),
  941. .init = init,
  942. .uninit = uninit,
  943. .query_formats = query_formats,
  944. .process_command = process_command,
  945. .priv_size = sizeof(ATempoContext),
  946. .inputs = (const AVFilterPad[]) {
  947. { .name = "default",
  948. .type = AVMEDIA_TYPE_AUDIO,
  949. .filter_samples = filter_samples,
  950. .config_props = config_props,
  951. .min_perms = AV_PERM_READ, },
  952. { .name = NULL}
  953. },
  954. .outputs = (const AVFilterPad[]) {
  955. { .name = "default",
  956. .request_frame = request_frame,
  957. .type = AVMEDIA_TYPE_AUDIO, },
  958. { .name = NULL}
  959. },
  960. };