/* * alsa audio handling * * Written in 2010-2020 by Andy Green * * This file is made available under the Creative Commons CC0 1.0 * Universal Public Domain Dedication. */ #include #include #include #include #include #include #include #include #include #include "private.h" extern struct lws_ss_handle *hss_avs_event, *hss_avs_sync; int avs_query_start(struct lws_context *context); enum { MODE_IDLE, MODE_CAPTURING, MODE_PLAYING }; struct raw_vhd { int16_t p[8 * 1024]; /* 500ms at 16kHz 16-bit PCM */ pv_porcupine_object_t *porc; snd_pcm_t *pcm_capture; snd_pcm_t *pcm_playback; snd_pcm_hw_params_t *params; snd_pcm_uframes_t frames; int16_t *porcbuf; mpg123_handle *mh; mp3_done_cb done_cb; void *opaque; int mode; int rate; int porc_spf; int filefd; int rpos; int wpos; int porcpos; int npos; int times; int quietcount; int anycount; int wplay; int rplay; char last_wake_detect; char destroy_mh_on_drain; }; static struct raw_vhd *avhd; /* * called from alexa.c to grab the next chunk of audio capture buffer * for upload */ int spool_capture(uint8_t *buf, size_t len) { int16_t *sam = (int16_t *)buf; size_t s, os; if (avhd->mode != MODE_CAPTURING) return -1; os = s = len / 2; while (s && avhd->wpos != avhd->npos) { *sam++ = avhd->p[avhd->npos]; avhd->npos = (avhd->npos + 1) % LWS_ARRAY_SIZE(avhd->p); s--; } lwsl_info("Copied %d samples (%d %d)\n", (int)(os - s), avhd->wpos, avhd->npos); return (os - s) * 2; } /* * Called from alexa.c to control when the mp3 playback should begin and end */ int play_mp3(mpg123_handle *mh, mp3_done_cb cb, void *opaque) { if (mh) { avhd->mh = mh; avhd->mode = MODE_PLAYING; snd_pcm_prepare(avhd->pcm_playback); return 0; } avhd->destroy_mh_on_drain = 1; avhd->done_cb = cb; avhd->opaque = opaque; return 0; } /* * Helper used to set alsa hwparams on both capture and playback channels */ static int set_hw_params(struct lws_vhost *vh, snd_pcm_t **pcm, int type) { unsigned int rate = pv_sample_rate(); /* it's 16kHz */ snd_pcm_hw_params_t *params; lws_sock_file_fd_type u; struct pollfd pfd; struct lws *wsi1; int n; n = snd_pcm_open(pcm, "default", type, SND_PCM_NONBLOCK); if (n < 0) { lwsl_err("%s: Can't open default for playback: %s\n", __func__, snd_strerror(n)); return -1; } if (snd_pcm_poll_descriptors(*pcm, &pfd, 1) != 1) { lwsl_err("%s: failed to get playback desc\n", __func__); return -1; } u.filefd = (lws_filefd_type)(long long)pfd.fd; wsi1 = lws_adopt_descriptor_vhost(vh, LWS_ADOPT_RAW_FILE_DESC, u, "lws-audio-test", NULL); if (!wsi1) { lwsl_err("%s: Failed to adopt playback desc\n", __func__); goto bail; } if (type == SND_PCM_STREAM_PLAYBACK) lws_rx_flow_control(wsi1, 0); /* no POLLIN */ snd_pcm_hw_params_malloc(¶ms); snd_pcm_hw_params_any(*pcm, params); n = snd_pcm_hw_params_set_access(*pcm, params, SND_PCM_ACCESS_RW_INTERLEAVED); if (n < 0) goto bail1; n = snd_pcm_hw_params_set_format(*pcm, params, SND_PCM_FORMAT_S16_LE); if (n < 0) goto bail1; n = snd_pcm_hw_params_set_channels(*pcm, params, 1); if (n < 0) goto bail1; n = snd_pcm_hw_params_set_rate_near(*pcm, params, &rate, 0); if (n < 0) goto bail1; lwsl_notice("%s: %s rate %d\n", __func__, type == SND_PCM_STREAM_PLAYBACK ? "Playback" : "Capture", rate); n = snd_pcm_hw_params(*pcm, params); snd_pcm_hw_params_free(params); if (n < 0) goto bail; return 0; bail1: snd_pcm_hw_params_free(params); bail: lwsl_err("%s: Set hw params failed: %s\n", __func__, snd_strerror(n)); return -1; } /* * The lws RAW file protocol handler that wraps ALSA. * * The timing is coming from ALSA capture channel... since they are both set to * 16kHz, it's enough just to have the one. */ static int callback_audio(struct lws *wsi, enum lws_callback_reasons reason, void *user, void *in, size_t len) { struct raw_vhd *vhd = (struct raw_vhd *)lws_protocol_vh_priv_get( lws_get_vhost(wsi), lws_get_protocol(wsi)); uint16_t rands[50]; int16_t temp[256]; bool det; long avg; int n, s; switch (reason) { case LWS_CALLBACK_PROTOCOL_INIT: if (avhd) /* just on one vhost */ return 0; avhd = vhd = lws_protocol_vh_priv_zalloc(lws_get_vhost(wsi), lws_get_protocol(wsi), sizeof(struct raw_vhd)); /* * Set up the wakeword library */ n = pv_porcupine_init("porcupine_params.pv", "alexa_linux.ppn", 1.0, &vhd->porc); if (n) { lwsl_err("%s: porcupine init fail %d\n", __func__, n); return -1; } vhd->porc_spf = pv_porcupine_frame_length(); vhd->porcbuf = malloc(vhd->porc_spf * 2); lwsl_info("%s: %s porc frame length is %d samples\n", __func__, lws_get_vhost_name(lws_get_vhost(wsi)), vhd->porc_spf); vhd->rate = pv_sample_rate(); /* 16kHz */ /* set up alsa */ if (set_hw_params(lws_get_vhost(wsi), &vhd->pcm_playback, SND_PCM_STREAM_PLAYBACK)) { lwsl_err("%s: Can't open default for playback\n", __func__); return -1; } if (set_hw_params(lws_get_vhost(wsi), &vhd->pcm_capture, SND_PCM_STREAM_CAPTURE)) { lwsl_err("%s: Can't open default for capture\n", __func__); return -1; } snd_config_update_free_global(); break; case LWS_CALLBACK_PROTOCOL_DESTROY: lwsl_info("%s: LWS_CALLBACK_PROTOCOL_DESTROY\n", __func__); if (!vhd) break; if (vhd->porcbuf) { free(vhd->porcbuf); vhd->porcbuf = NULL; } if (vhd->pcm_playback) { snd_pcm_drop(vhd->pcm_playback); snd_pcm_close(vhd->pcm_playback); vhd->pcm_playback = NULL; } if (vhd->pcm_capture) { snd_pcm_drop(vhd->pcm_capture); snd_pcm_close(vhd->pcm_capture); vhd->pcm_capture = NULL; } if (vhd->porc) { pv_porcupine_delete(vhd->porc); vhd->porc = NULL; } /* avoid most of the valgrind mess from alsa */ snd_config_update_free_global(); break; case LWS_CALLBACK_RAW_CLOSE_FILE: lwsl_info("%s: closed\n", __func__); break; case LWS_CALLBACK_RAW_RX_FILE: /* we come here about every 250ms */ /* * Playing back the mp3? */ if (vhd->mode == MODE_PLAYING && vhd->mh) { size_t amt, try; do { try = snd_pcm_avail(vhd->pcm_playback); if (try > LWS_ARRAY_SIZE(vhd->p)) try = LWS_ARRAY_SIZE(vhd->p); n = mpg123_read(vhd->mh, (uint8_t *)vhd->p, try * 2, &amt); lwsl_info("%s: PLAYING: mpg123 read %d, n %d\n", __func__, (int)amt, n); if (n == MPG123_NEW_FORMAT) { snd_pcm_start(vhd->pcm_playback); memset(vhd->p, 0, try); snd_pcm_writei(vhd->pcm_playback, vhd->p, try / 2); snd_pcm_prepare(vhd->pcm_playback); } } while (n == MPG123_NEW_FORMAT); if (amt) { n = snd_pcm_writei(vhd->pcm_playback, vhd->p, amt / 2); if (n < 0) lwsl_notice("%s: snd_pcm_writei: %d %s\n", __func__, n, snd_strerror(n)); if (n == -EPIPE) { lwsl_err("%s: did EPIPE prep\n", __func__); snd_pcm_prepare(vhd->pcm_playback); } } else if (vhd->destroy_mh_on_drain && n != MPG123_NEW_FORMAT) { snd_pcm_drain(vhd->pcm_playback); vhd->destroy_mh_on_drain = 0; lwsl_notice("%s: mp3 destroyed\n", __func__); mpg123_close(vhd->mh); mpg123_delete(vhd->mh); vhd->mh = NULL; vhd->mode = MODE_IDLE; if (vhd->done_cb) vhd->done_cb(vhd->opaque); } } /* * Get the capture data */ n = snd_pcm_readi(vhd->pcm_capture, temp, LWS_ARRAY_SIZE(temp)); s = 0; while (s < n) { vhd->p[(vhd->wpos + s) % LWS_ARRAY_SIZE(vhd->p)] = temp[s]; s++; } if (vhd->mode == MODE_CAPTURING) { /* * We are recording an utterance. * * Estimate the sound density in the frame by picking 50 * samples at random and averaging the sampled * [abs()^2] / 10000 to create a Figure of Merit. * * Speaking on my laptop gets us 1000 - 5000, silence * is typ under 30. The wakeword tells us there was * speech at the start, end the capture when there's * ~750ms (12000 samples) under 125 FOM. */ #define SILENCE_THRESH 125 avg = 0; lws_get_random(lws_get_context(wsi), rands, sizeof(rands)); for (s = 0; s < (int)LWS_ARRAY_SIZE(rands); s++) { long q; q = temp[rands[s] % n]; avg += (q * q); } avg = (avg / (int)LWS_ARRAY_SIZE(rands)) / 10000; lwsl_notice("est audio energy: %ld %d\n", avg, vhd->mode); /* * Only start looking for "silence" after 1.5s, in case * he does a long pause after the wakeword */ if (vhd->anycount < (3 *vhd->rate) / 2 && avg < SILENCE_THRESH) { vhd->quietcount += n; /* then 500ms of "silence" does it for us */ if (vhd->quietcount >= ((vhd->rate * 3) / 4)) { lwsl_warn("%s: ended capture\n", __func__); vhd->mode = MODE_IDLE; vhd->quietcount = 0; } } /* if we're not "silent", reset the count */ if (avg > SILENCE_THRESH * 2) vhd->quietcount = 0; /* * Since we are in capturing mode, we have something * new to send now. * * We must send an extra one at the end so we can finish * the tx. */ lws_ss_request_tx(hss_avs_sync); } /* * Just waiting for a wakeword */ while (vhd->mode == MODE_IDLE) { int m = 0, ppold = vhd->porcpos; s = (vhd->wpos - vhd->porcpos) % LWS_ARRAY_SIZE(vhd->p); if (s < vhd->porc_spf) goto eol; while (m < vhd->porc_spf) { vhd->porcbuf[m++] = avhd->p[vhd->porcpos]; vhd->porcpos = (vhd->porcpos + 1) % LWS_ARRAY_SIZE(vhd->p); } if (pv_porcupine_process(vhd->porc, vhd->porcbuf, &det)) lwsl_err("%s: porc_process failed\n", __func__); if (!det && vhd->last_wake_detect && vhd->mode == MODE_IDLE) { lwsl_warn("************* Wakeword\n"); if (!avs_query_start(lws_get_context(wsi))) { vhd->mode = MODE_CAPTURING; vhd->quietcount = 0; vhd->last_wake_detect = det; vhd->npos = ppold; break; } } vhd->last_wake_detect = det; } eol: vhd->wpos = (vhd->wpos + n) % LWS_ARRAY_SIZE(vhd->p); break; default: break; } return 0; } struct lws_protocols protocol_audio_test = { "lws-audio-test", callback_audio, 0, 0 };