asrc_flite.c 9.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291
  1. /*
  2. * Copyright (c) 2012 Stefano Sabatini
  3. *
  4. * This file is part of FFmpeg.
  5. *
  6. * FFmpeg is free software; you can redistribute it and/or
  7. * modify it under the terms of the GNU Lesser General Public
  8. * License as published by the Free Software Foundation; either
  9. * version 2.1 of the License, or (at your option) any later version.
  10. *
  11. * FFmpeg is distributed in the hope that it will be useful,
  12. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  13. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  14. * Lesser General Public License for more details.
  15. *
  16. * You should have received a copy of the GNU Lesser General Public
  17. * License along with FFmpeg; if not, write to the Free Software
  18. * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  19. */
  20. /**
  21. * @file
  22. * flite voice synth source
  23. */
  24. #include <flite/flite.h>
  25. #include "libavutil/audioconvert.h"
  26. #include "libavutil/file.h"
  27. #include "libavutil/opt.h"
  28. #include "avfilter.h"
  29. #include "audio.h"
  30. #include "formats.h"
  31. #include "internal.h"
  /** Private context of the flite audio source. */
  32. typedef struct {
  33. const AVClass *class; ///< set to &flite_class by init()
  34. char *voice_str; ///< "voice"/"v" option: name of the voice to use
  35. char *textfile; ///< "textfile" option: name of the file holding the text to speak
  36. char *text; ///< "text" option, or a copy of the textfile contents made by init()
  37. cst_wave *wave; ///< waveform synthesized from text by init()
  38. int16_t *wave_samples; ///< read position inside wave->samples, advanced by request_frame()
  39. int wave_nb_samples; ///< starts at wave->num_samples, decreased by request_frame() as frames are sent
  40. int list_voices; ///< "list_voices" option: when set, init() lists the voices and returns AVERROR_EXIT
  41. cst_voice *voice; ///< copy of voice_entry->voice
  42. struct voice_entry *voice_entry; ///< table entry chosen by select_voice()
  43. int64_t pts; ///< timestamp of the next frame, counted in samples
  44. int frame_nb_samples; ///< number of samples per frame
  45. } FliteContext;
  /* Byte offset of field x inside FliteContext, as stored in the option table below. */
  46. #define OFFSET(x) offsetof(FliteContext, x)
  /*
   * Options accepted by the flite source. "nb_samples"/"n" and "voice"/"v" are
   * aliases: both names of a pair write to the same FliteContext field.
   */
  47. static const AVOption flite_options[] = {
  48. { "list_voices", "list voices and exit", OFFSET(list_voices), AV_OPT_TYPE_INT, {.dbl=0}, 0, 1 },
  49. { "nb_samples", "set number of samples per frame", OFFSET(frame_nb_samples), AV_OPT_TYPE_INT, {.dbl=512}, 0, INT_MAX },
  50. { "n", "set number of samples per frame", OFFSET(frame_nb_samples), AV_OPT_TYPE_INT, {.dbl=512}, 0, INT_MAX },
  51. { "text", "set text to speak", OFFSET(text), AV_OPT_TYPE_STRING, {.str=NULL}, CHAR_MIN, CHAR_MAX },
  52. { "textfile", "set filename of the text to speak", OFFSET(textfile), AV_OPT_TYPE_STRING, {.str=NULL}, CHAR_MIN, CHAR_MAX },
  53. { "v", "set voice", OFFSET(voice_str), AV_OPT_TYPE_STRING, {.str="kal"}, CHAR_MIN, CHAR_MAX },
  54. { "voice", "set voice", OFFSET(voice_str), AV_OPT_TYPE_STRING, {.str="kal"}, CHAR_MIN, CHAR_MAX },
  55. { NULL }
  56. };
  /* NOTE(review): presumably this is what defines flite_class, which init() references; confirm against the macro definition. */
  57. AVFILTER_DEFINE_CLASS(flite);
  /*
   * Non-zero once flite_init() has succeeded; init() checks it so that the
   * library initialization is not repeated for every filter instance.
   * NOTE(review): the test-and-increment in init() is not atomic; confirm that
   * filters are never initialized concurrently.
   */
  58. static volatile int flite_inited = 0;
  59. /* declare functions for all the supported voices */
  60. #define DECLARE_REGISTER_VOICE_FN(name) \
  61. cst_voice *register_cmu_us_## name(const char *); \
  62. void unregister_cmu_us_## name(cst_voice *);
  63. DECLARE_REGISTER_VOICE_FN(awb);
  64. DECLARE_REGISTER_VOICE_FN(kal);
  65. DECLARE_REGISTER_VOICE_FN(kal16);
  66. DECLARE_REGISTER_VOICE_FN(rms);
  67. DECLARE_REGISTER_VOICE_FN(slt);
  68. struct voice_entry {
  69. const char *name;
  70. cst_voice * (*register_fn)(const char *);
  71. void (*unregister_fn)(cst_voice *);
  72. cst_voice *voice;
  73. unsigned usage_count;
  74. } voice_entry;
  75. #define MAKE_VOICE_STRUCTURE(voice_name) { \
  76. .name = #voice_name, \
  77. .register_fn = register_cmu_us_ ## voice_name, \
  78. .unregister_fn = unregister_cmu_us_ ## voice_name, \
  79. }
  80. static struct voice_entry voice_entries[] = {
  81. MAKE_VOICE_STRUCTURE(awb),
  82. MAKE_VOICE_STRUCTURE(kal),
  83. MAKE_VOICE_STRUCTURE(kal16),
  84. MAKE_VOICE_STRUCTURE(rms),
  85. MAKE_VOICE_STRUCTURE(slt),
  86. };
  87. static void list_voices(void *log_ctx, const char *sep)
  88. {
  89. int i, n = FF_ARRAY_ELEMS(voice_entries);
  90. for (i = 0; i < n; i++)
  91. av_log(log_ctx, AV_LOG_INFO, "%s%s",
  92. voice_entries[i].name, i < (n-1) ? sep : "\n");
  93. }
  94. static int select_voice(struct voice_entry **entry_ret, const char *voice_name, void *log_ctx)
  95. {
  96. int i;
  97. for (i = 0; i < FF_ARRAY_ELEMS(voice_entries); i++) {
  98. struct voice_entry *entry = &voice_entries[i];
  99. if (!strcmp(entry->name, voice_name)) {
  100. if (!entry->voice)
  101. entry->voice = entry->register_fn(NULL);
  102. if (!entry->voice) {
  103. av_log(log_ctx, AV_LOG_ERROR,
  104. "Could not register voice '%s'\n", voice_name);
  105. return AVERROR_UNKNOWN;
  106. }
  107. entry->usage_count++;
  108. *entry_ret = entry;
  109. return 0;
  110. }
  111. }
  112. av_log(log_ctx, AV_LOG_ERROR, "Could not find voice '%s'\n", voice_name);
  113. av_log(log_ctx, AV_LOG_INFO, "Choose between the voices: ");
  114. list_voices(log_ctx, ", ");
  115. return AVERROR(EINVAL);
  116. }
  117. static av_cold int init(AVFilterContext *ctx, const char *args)
  118. {
  119. FliteContext *flite = ctx->priv;
  120. int ret = 0;
  121. flite->class = &flite_class;
  122. av_opt_set_defaults(flite);
  123. if ((ret = av_set_options_string(flite, args, "=", ":")) < 0) {
  124. av_log(ctx, AV_LOG_ERROR, "Error parsing options string: '%s'\n", args);
  125. return ret;
  126. }
  127. if (flite->list_voices) {
  128. list_voices(ctx, "\n");
  129. return AVERROR_EXIT;
  130. }
  131. if (!flite_inited) {
  132. if (flite_init() < 0) {
  133. av_log(ctx, AV_LOG_ERROR, "flite initialization failed\n");
  134. return AVERROR_UNKNOWN;
  135. }
  136. flite_inited++;
  137. }
  138. if ((ret = select_voice(&flite->voice_entry, flite->voice_str, ctx)) < 0)
  139. return ret;
  140. flite->voice = flite->voice_entry->voice;
  141. if (flite->textfile && flite->text) {
  142. av_log(ctx, AV_LOG_ERROR,
  143. "Both text and textfile options set: only one must be specified\n");
  144. return AVERROR(EINVAL);
  145. }
  146. if (flite->textfile) {
  147. uint8_t *textbuf;
  148. size_t textbuf_size;
  149. if ((ret = av_file_map(flite->textfile, &textbuf, &textbuf_size, 0, ctx)) < 0) {
  150. av_log(ctx, AV_LOG_ERROR,
  151. "The text file '%s' could not be read: %s\n",
  152. flite->textfile, av_err2str(ret));
  153. return ret;
  154. }
  155. if (!(flite->text = av_malloc(textbuf_size+1)))
  156. return AVERROR(ENOMEM);
  157. memcpy(flite->text, textbuf, textbuf_size);
  158. flite->text[textbuf_size] = 0;
  159. av_file_unmap(textbuf, textbuf_size);
  160. }
  161. if (!flite->text) {
  162. av_log(ctx, AV_LOG_ERROR,
  163. "No speech text specified, specify the 'text' or 'textfile' option\n");
  164. return AVERROR(EINVAL);
  165. }
  166. /* synth all the file data in block */
  167. flite->wave = flite_text_to_wave(flite->text, flite->voice);
  168. flite->wave_samples = flite->wave->samples;
  169. flite->wave_nb_samples = flite->wave->num_samples;
  170. return 0;
  171. }
  172. static av_cold void uninit(AVFilterContext *ctx)
  173. {
  174. FliteContext *flite = ctx->priv;
  175. av_opt_free(flite);
  176. if (!--flite->voice_entry->usage_count)
  177. flite->voice_entry->unregister_fn(flite->voice);
  178. flite->voice = NULL;
  179. flite->voice_entry = NULL;
  180. delete_wave(flite->wave);
  181. flite->wave = NULL;
  182. }
  183. static int query_formats(AVFilterContext *ctx)
  184. {
  185. FliteContext *flite = ctx->priv;
  186. AVFilterChannelLayouts *chlayouts = NULL;
  187. int64_t chlayout = av_get_default_channel_layout(flite->wave->num_channels);
  188. AVFilterFormats *sample_formats = NULL;
  189. AVFilterFormats *sample_rates = NULL;
  190. ff_add_channel_layout(&chlayouts, chlayout);
  191. ff_set_common_channel_layouts(ctx, chlayouts);
  192. ff_add_format(&sample_formats, AV_SAMPLE_FMT_S16);
  193. ff_set_common_formats(ctx, sample_formats);
  194. ff_add_format(&sample_rates, flite->wave->sample_rate);
  195. ff_set_common_samplerates (ctx, sample_rates);
  196. return 0;
  197. }
  198. static int config_props(AVFilterLink *outlink)
  199. {
  200. AVFilterContext *ctx = outlink->src;
  201. FliteContext *flite = ctx->priv;
  202. outlink->sample_rate = flite->wave->sample_rate;
  203. outlink->time_base = (AVRational){1, flite->wave->sample_rate};
  204. av_log(ctx, AV_LOG_VERBOSE, "voice:%s fmt:%s sample_rate:%d\n",
  205. flite->voice_str,
  206. av_get_sample_fmt_name(outlink->format), outlink->sample_rate);
  207. return 0;
  208. }
  209. static int request_frame(AVFilterLink *outlink)
  210. {
  211. AVFilterBufferRef *samplesref;
  212. FliteContext *flite = outlink->src->priv;
  213. int nb_samples = FFMIN(flite->wave_nb_samples, flite->frame_nb_samples);
  214. if (!nb_samples)
  215. return AVERROR_EOF;
  216. samplesref = ff_get_audio_buffer(outlink, AV_PERM_WRITE, nb_samples);
  217. if (!samplesref)
  218. return AVERROR(ENOMEM);
  219. memcpy(samplesref->data[0], flite->wave_samples,
  220. nb_samples * flite->wave->num_channels * 2);
  221. samplesref->pts = flite->pts;
  222. samplesref->pos = -1;
  223. samplesref->audio->sample_rate = flite->wave->sample_rate;
  224. flite->pts += nb_samples;
  225. flite->wave_samples += nb_samples * flite->wave->num_channels;
  226. flite->wave_nb_samples -= nb_samples;
  227. return ff_filter_samples(outlink, samplesref);
  228. }
  /*
   * Definition of the "flite" audio source filter: it has no input pad and a
   * single audio output pad named "default", served by config_props() and
   * request_frame().
   */
  229. AVFilter avfilter_asrc_flite = {
  230. .name = "flite",
  231. .description = NULL_IF_CONFIG_SMALL("Synthesize voice from text using libflite."),
  232. .query_formats = query_formats,
  233. .init = init,
  234. .uninit = uninit,
  235. .priv_size = sizeof(FliteContext), /* size of the private context that init() reads through ctx->priv */
  236. .inputs = (const AVFilterPad[]) {{ .name = NULL}}, /* no input pads: only the NULL-named end marker */
  237. .outputs = (const AVFilterPad[]) {
  238. {
  239. .name = "default",
  240. .type = AVMEDIA_TYPE_AUDIO,
  241. .config_props = config_props,
  242. .request_frame = request_frame,
  243. },
  244. { .name = NULL } /* end marker of the pad list */
  245. },
  246. };