asrc_flite.c 9.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283
  1. /*
  2. * Copyright (c) 2012 Stefano Sabatini
  3. *
  4. * This file is part of FFmpeg.
  5. *
  6. * FFmpeg is free software; you can redistribute it and/or
  7. * modify it under the terms of the GNU Lesser General Public
  8. * License as published by the Free Software Foundation; either
  9. * version 2.1 of the License, or (at your option) any later version.
  10. *
  11. * FFmpeg is distributed in the hope that it will be useful,
  12. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  13. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  14. * Lesser General Public License for more details.
  15. *
  16. * You should have received a copy of the GNU Lesser General Public
  17. * License along with FFmpeg; if not, write to the Free Software
  18. * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  19. */
  20. /**
  21. * @file
  22. * flite voice synth source
  23. */
  24. #include <flite/flite.h>
  25. #include "libavutil/channel_layout.h"
  26. #include "libavutil/file.h"
  27. #include "libavutil/opt.h"
  28. #include "avfilter.h"
  29. #include "audio.h"
  30. #include "formats.h"
  31. #include "internal.h"
  32. typedef struct {
  33. const AVClass *class;
  34. char *voice_str;
  35. char *textfile;
  36. char *text;
  37. cst_wave *wave;
  38. int16_t *wave_samples;
  39. int wave_nb_samples;
  40. int list_voices;
  41. cst_voice *voice;
  42. struct voice_entry *voice_entry;
  43. int64_t pts;
  44. int frame_nb_samples; ///< number of samples per frame
  45. } FliteContext;
  46. #define OFFSET(x) offsetof(FliteContext, x)
  47. #define FLAGS AV_OPT_FLAG_AUDIO_PARAM|AV_OPT_FLAG_FILTERING_PARAM
  48. static const AVOption flite_options[] = {
  49. { "list_voices", "list voices and exit", OFFSET(list_voices), AV_OPT_TYPE_INT, {.i64=0}, 0, 1, FLAGS },
  50. { "nb_samples", "set number of samples per frame", OFFSET(frame_nb_samples), AV_OPT_TYPE_INT, {.i64=512}, 0, INT_MAX, FLAGS },
  51. { "n", "set number of samples per frame", OFFSET(frame_nb_samples), AV_OPT_TYPE_INT, {.i64=512}, 0, INT_MAX, FLAGS },
  52. { "text", "set text to speak", OFFSET(text), AV_OPT_TYPE_STRING, {.str=NULL}, CHAR_MIN, CHAR_MAX, FLAGS },
  53. { "textfile", "set filename of the text to speak", OFFSET(textfile), AV_OPT_TYPE_STRING, {.str=NULL}, CHAR_MIN, CHAR_MAX, FLAGS },
  54. { "v", "set voice", OFFSET(voice_str), AV_OPT_TYPE_STRING, {.str="kal"}, CHAR_MIN, CHAR_MAX, FLAGS },
  55. { "voice", "set voice", OFFSET(voice_str), AV_OPT_TYPE_STRING, {.str="kal"}, CHAR_MIN, CHAR_MAX, FLAGS },
  56. { NULL }
  57. };
  58. AVFILTER_DEFINE_CLASS(flite);
  59. static volatile int flite_inited = 0;
  60. /* declare functions for all the supported voices */
  61. #define DECLARE_REGISTER_VOICE_FN(name) \
  62. cst_voice *register_cmu_us_## name(const char *); \
  63. void unregister_cmu_us_## name(cst_voice *);
  64. DECLARE_REGISTER_VOICE_FN(awb);
  65. DECLARE_REGISTER_VOICE_FN(kal);
  66. DECLARE_REGISTER_VOICE_FN(kal16);
  67. DECLARE_REGISTER_VOICE_FN(rms);
  68. DECLARE_REGISTER_VOICE_FN(slt);
  69. struct voice_entry {
  70. const char *name;
  71. cst_voice * (*register_fn)(const char *);
  72. void (*unregister_fn)(cst_voice *);
  73. cst_voice *voice;
  74. unsigned usage_count;
  75. } voice_entry;
  76. #define MAKE_VOICE_STRUCTURE(voice_name) { \
  77. .name = #voice_name, \
  78. .register_fn = register_cmu_us_ ## voice_name, \
  79. .unregister_fn = unregister_cmu_us_ ## voice_name, \
  80. }
  81. static struct voice_entry voice_entries[] = {
  82. MAKE_VOICE_STRUCTURE(awb),
  83. MAKE_VOICE_STRUCTURE(kal),
  84. MAKE_VOICE_STRUCTURE(kal16),
  85. MAKE_VOICE_STRUCTURE(rms),
  86. MAKE_VOICE_STRUCTURE(slt),
  87. };
  88. static void list_voices(void *log_ctx, const char *sep)
  89. {
  90. int i, n = FF_ARRAY_ELEMS(voice_entries);
  91. for (i = 0; i < n; i++)
  92. av_log(log_ctx, AV_LOG_INFO, "%s%s",
  93. voice_entries[i].name, i < (n-1) ? sep : "\n");
  94. }
  95. static int select_voice(struct voice_entry **entry_ret, const char *voice_name, void *log_ctx)
  96. {
  97. int i;
  98. for (i = 0; i < FF_ARRAY_ELEMS(voice_entries); i++) {
  99. struct voice_entry *entry = &voice_entries[i];
  100. if (!strcmp(entry->name, voice_name)) {
  101. if (!entry->voice)
  102. entry->voice = entry->register_fn(NULL);
  103. if (!entry->voice) {
  104. av_log(log_ctx, AV_LOG_ERROR,
  105. "Could not register voice '%s'\n", voice_name);
  106. return AVERROR_UNKNOWN;
  107. }
  108. entry->usage_count++;
  109. *entry_ret = entry;
  110. return 0;
  111. }
  112. }
  113. av_log(log_ctx, AV_LOG_ERROR, "Could not find voice '%s'\n", voice_name);
  114. av_log(log_ctx, AV_LOG_INFO, "Choose between the voices: ");
  115. list_voices(log_ctx, ", ");
  116. return AVERROR(EINVAL);
  117. }
  118. static av_cold int init(AVFilterContext *ctx)
  119. {
  120. FliteContext *flite = ctx->priv;
  121. int ret = 0;
  122. if (flite->list_voices) {
  123. list_voices(ctx, "\n");
  124. return AVERROR_EXIT;
  125. }
  126. if (!flite_inited) {
  127. if (flite_init() < 0) {
  128. av_log(ctx, AV_LOG_ERROR, "flite initialization failed\n");
  129. return AVERROR_UNKNOWN;
  130. }
  131. flite_inited++;
  132. }
  133. if ((ret = select_voice(&flite->voice_entry, flite->voice_str, ctx)) < 0)
  134. return ret;
  135. flite->voice = flite->voice_entry->voice;
  136. if (flite->textfile && flite->text) {
  137. av_log(ctx, AV_LOG_ERROR,
  138. "Both text and textfile options set: only one must be specified\n");
  139. return AVERROR(EINVAL);
  140. }
  141. if (flite->textfile) {
  142. uint8_t *textbuf;
  143. size_t textbuf_size;
  144. if ((ret = av_file_map(flite->textfile, &textbuf, &textbuf_size, 0, ctx)) < 0) {
  145. av_log(ctx, AV_LOG_ERROR,
  146. "The text file '%s' could not be read: %s\n",
  147. flite->textfile, av_err2str(ret));
  148. return ret;
  149. }
  150. if (!(flite->text = av_malloc(textbuf_size+1)))
  151. return AVERROR(ENOMEM);
  152. memcpy(flite->text, textbuf, textbuf_size);
  153. flite->text[textbuf_size] = 0;
  154. av_file_unmap(textbuf, textbuf_size);
  155. }
  156. if (!flite->text) {
  157. av_log(ctx, AV_LOG_ERROR,
  158. "No speech text specified, specify the 'text' or 'textfile' option\n");
  159. return AVERROR(EINVAL);
  160. }
  161. /* synth all the file data in block */
  162. flite->wave = flite_text_to_wave(flite->text, flite->voice);
  163. flite->wave_samples = flite->wave->samples;
  164. flite->wave_nb_samples = flite->wave->num_samples;
  165. return 0;
  166. }
  167. static av_cold void uninit(AVFilterContext *ctx)
  168. {
  169. FliteContext *flite = ctx->priv;
  170. if (!--flite->voice_entry->usage_count)
  171. flite->voice_entry->unregister_fn(flite->voice);
  172. flite->voice = NULL;
  173. flite->voice_entry = NULL;
  174. delete_wave(flite->wave);
  175. flite->wave = NULL;
  176. }
  177. static int query_formats(AVFilterContext *ctx)
  178. {
  179. FliteContext *flite = ctx->priv;
  180. AVFilterChannelLayouts *chlayouts = NULL;
  181. int64_t chlayout = av_get_default_channel_layout(flite->wave->num_channels);
  182. AVFilterFormats *sample_formats = NULL;
  183. AVFilterFormats *sample_rates = NULL;
  184. ff_add_channel_layout(&chlayouts, chlayout);
  185. ff_set_common_channel_layouts(ctx, chlayouts);
  186. ff_add_format(&sample_formats, AV_SAMPLE_FMT_S16);
  187. ff_set_common_formats(ctx, sample_formats);
  188. ff_add_format(&sample_rates, flite->wave->sample_rate);
  189. ff_set_common_samplerates (ctx, sample_rates);
  190. return 0;
  191. }
  192. static int config_props(AVFilterLink *outlink)
  193. {
  194. AVFilterContext *ctx = outlink->src;
  195. FliteContext *flite = ctx->priv;
  196. outlink->sample_rate = flite->wave->sample_rate;
  197. outlink->time_base = (AVRational){1, flite->wave->sample_rate};
  198. av_log(ctx, AV_LOG_VERBOSE, "voice:%s fmt:%s sample_rate:%d\n",
  199. flite->voice_str,
  200. av_get_sample_fmt_name(outlink->format), outlink->sample_rate);
  201. return 0;
  202. }
  203. static int request_frame(AVFilterLink *outlink)
  204. {
  205. AVFrame *samplesref;
  206. FliteContext *flite = outlink->src->priv;
  207. int nb_samples = FFMIN(flite->wave_nb_samples, flite->frame_nb_samples);
  208. if (!nb_samples)
  209. return AVERROR_EOF;
  210. samplesref = ff_get_audio_buffer(outlink, nb_samples);
  211. if (!samplesref)
  212. return AVERROR(ENOMEM);
  213. memcpy(samplesref->data[0], flite->wave_samples,
  214. nb_samples * flite->wave->num_channels * 2);
  215. samplesref->pts = flite->pts;
  216. av_frame_set_pkt_pos(samplesref, -1);
  217. av_frame_set_sample_rate(samplesref, flite->wave->sample_rate);
  218. flite->pts += nb_samples;
  219. flite->wave_samples += nb_samples * flite->wave->num_channels;
  220. flite->wave_nb_samples -= nb_samples;
  221. return ff_filter_frame(outlink, samplesref);
  222. }
  223. static const AVFilterPad flite_outputs[] = {
  224. {
  225. .name = "default",
  226. .type = AVMEDIA_TYPE_AUDIO,
  227. .config_props = config_props,
  228. .request_frame = request_frame,
  229. },
  230. { NULL }
  231. };
  232. AVFilter ff_asrc_flite = {
  233. .name = "flite",
  234. .description = NULL_IF_CONFIG_SMALL("Synthesize voice from text using libflite."),
  235. .query_formats = query_formats,
  236. .init = init,
  237. .uninit = uninit,
  238. .priv_size = sizeof(FliteContext),
  239. .inputs = NULL,
  240. .outputs = flite_outputs,
  241. .priv_class = &flite_class,
  242. };