From 9f5e91ade24ff4e4e7ac77d60a33f05445201323 Mon Sep 17 00:00:00 2001
From: Hongyu Li
Date: Fri, 18 Mar 2022 17:05:40 -0400
Subject: [PATCH] moved respeak test node here

---
 .../launch/audio.launch              |   5 +-
 .../launch/audio_test.launch         |   4 +
 .../msg/MyAudioData.msg              |   2 +-
 .../scripts/ros_interface.py         |  16 +-
 .../scripts/stretch_respeak_test2.py | 263 ++++++++++++++++++
 5 files changed, 280 insertions(+), 10 deletions(-)
 create mode 100644 vz_acoustic_scene_analysis/launch/audio_test.launch
 create mode 100644 vz_acoustic_scene_analysis/scripts/stretch_respeak_test2.py

diff --git a/vz_acoustic_scene_analysis/launch/audio.launch b/vz_acoustic_scene_analysis/launch/audio.launch
index e954e2c..2912b46 100644
--- a/vz_acoustic_scene_analysis/launch/audio.launch
+++ b/vz_acoustic_scene_analysis/launch/audio.launch
@@ -1,13 +1,12 @@
-
+

-
-
+

diff --git a/vz_acoustic_scene_analysis/launch/audio_test.launch b/vz_acoustic_scene_analysis/launch/audio_test.launch
new file mode 100644
index 0000000..eab5a03
--- /dev/null
+++ b/vz_acoustic_scene_analysis/launch/audio_test.launch
@@ -0,0 +1,4 @@
+
+
+
+
\ No newline at end of file
diff --git a/vz_acoustic_scene_analysis/msg/MyAudioData.msg b/vz_acoustic_scene_analysis/msg/MyAudioData.msg
index 38c1e43..1735052 100644
--- a/vz_acoustic_scene_analysis/msg/MyAudioData.msg
+++ b/vz_acoustic_scene_analysis/msg/MyAudioData.msg
@@ -1 +1 @@
-uint16[] data
\ No newline at end of file
+uint8[] data
\ No newline at end of file
diff --git a/vz_acoustic_scene_analysis/scripts/ros_interface.py b/vz_acoustic_scene_analysis/scripts/ros_interface.py
index 959d36f..b18612e 100755
--- a/vz_acoustic_scene_analysis/scripts/ros_interface.py
+++ b/vz_acoustic_scene_analysis/scripts/ros_interface.py
@@ -23,7 +23,7 @@ class RosInterface:
         # self.maxSize = 7
         # self.queue = [None] * 7
         # self.head = self.tail = -1
-        self.nparray = np.empty(1)
+        self.wav_data = []
         self.arraylength = 0
         self.msg_count = 0
@@ -53,9 +53,11 @@ class RosInterface:
     def raw_callback(self, msg):
         # print("Length of uint8[]:", len(msg.data))
-        if (self.msg_count < 50):
-            self.arraylength += len(msg.data)
-            np.append(self.nparray,bytes)
+        self.wav_data.append(msg.data)
+
+        # if (self.msg_count < 10000):
+        #     self.arraylength += len(msg.data)
+        # print(self.nparray)
         # print(len(bytes))
         # else :
         #     self.byteArray[self.msg_count] = bytes
         self.msg_count += 1
@@ -63,8 +65,10 @@ class RosInterface:
     def on_shutdown(self):
-        print(str(self.arraylength))
-        write(self.save_dir +'test.wav', self.arraylength, self.nparray)
+        wav_arr = np.array(self.wav_data)
+        print(wav_arr)
+        print(wav_arr.shape)
+        write(self.save_dir +'test.mp3', 44100, wav_arr)
         print("check music")
         pass
diff --git a/vz_acoustic_scene_analysis/scripts/stretch_respeak_test2.py b/vz_acoustic_scene_analysis/scripts/stretch_respeak_test2.py
new file mode 100644
index 0000000..b971055
--- /dev/null
+++ b/vz_acoustic_scene_analysis/scripts/stretch_respeak_test2.py
@@ -0,0 +1,263 @@
#!/usr/bin/env python3

from __future__ import print_function
import pyaudio
import wave
import numpy as np
import usb.core
import struct
import time
import os
import sys
import rospy
from contextlib import contextmanager
import stretch_body.hello_utils as hu
hu.print_stretch_re_use()


# Temporarily redirect stderr (fd 2) to /dev/null so the ALSA/JACK warnings
# that PyAudio emits while probing audio devices do not flood the console.
@contextmanager
def ignore_stderr():
    devnull = None
    try:
        devnull = os.open(os.devnull, os.O_WRONLY)
        stderr = os.dup(2)
        sys.stderr.flush()
        os.dup2(devnull, 2)
        try:
            yield
        finally:
            os.dup2(stderr, 2)
            os.close(stderr)
    finally:
        if devnull is not None:
            os.close(devnull)


# parameter list
# name: (id, offset, type, max, min, r/w, info)
PARAMETERS = {
    'AECFREEZEONOFF': (18, 7, 'int', 1, 0, 'rw', 'Adaptive Echo Canceler updates inhibit.', '0 = Adaptation enabled', '1 = Freeze adaptation, filter only'),
    'AECNORM': (18, 19, 'float', 16, 0.25, 'rw', 'Limit on norm of AEC filter coefficients'),
    'AECPATHCHANGE': (18, 25, 'int', 1, 0, 'ro', 'AEC Path Change Detection.', '0 = false (no path change detected)', '1 = true (path change detected)'),
    'RT60': (18, 26, 'float', 0.9, 0.25, 'ro', 'Current RT60 estimate in seconds'),
    'HPFONOFF': (18, 27, 'int', 3, 0, 'rw', 'High-pass Filter on microphone signals.', '0 = OFF', '1 = ON - 70 Hz cut-off', '2 = ON - 125 Hz cut-off', '3 = ON - 180 Hz cut-off'),
    'RT60ONOFF': (18, 28, 'int', 1, 0, 'rw', 'RT60 Estimation for AES. 0 = OFF 1 = ON'),
    'AECSILENCELEVEL': (18, 30, 'float', 1, 1e-09, 'rw', 'Threshold for signal detection in AEC [-inf .. 0] dBov (Default: -80dBov = 10log10(1x10-8))'),
    'AECSILENCEMODE': (18, 31, 'int', 1, 0, 'ro', 'AEC far-end silence detection status.', '0 = false (signal detected)', '1 = true (silence detected)'),
    'AGCONOFF': (19, 0, 'int', 1, 0, 'rw', 'Automatic Gain Control.', '0 = OFF', '1 = ON'),
    'AGCMAXGAIN': (19, 1, 'float', 1000, 1, 'rw', 'Maximum AGC gain factor.', '[0 .. 60] dB (default 30dB = 20log10(31.6))'),
    'AGCDESIREDLEVEL': (19, 2, 'float', 0.99, 1e-08, 'rw', 'Target power level of the output signal.', '[-inf .. 0] dBov (default: -23dBov = 10log10(0.005))'),
    'AGCGAIN': (19, 3, 'float', 1000, 1, 'rw', 'Current AGC gain factor.', '[0 .. 60] dB (default: 0.0dB = 20log10(1.0))'),
    'AGCTIME': (19, 4, 'float', 1, 0.1, 'rw', 'Ramps-up / down time-constant in seconds.'),
    'CNIONOFF': (19, 5, 'int', 1, 0, 'rw', 'Comfort Noise Insertion.', '0 = OFF', '1 = ON'),
    'FREEZEONOFF': (19, 6, 'int', 1, 0, 'rw', 'Adaptive beamformer updates.', '0 = Adaptation enabled', '1 = Freeze adaptation, filter only'),
    'STATNOISEONOFF': (19, 8, 'int', 1, 0, 'rw', 'Stationary noise suppression.', '0 = OFF', '1 = ON'),
    'GAMMA_NS': (19, 9, 'float', 3, 0, 'rw', 'Over-subtraction factor of stationary noise. min .. max attenuation'),
    'MIN_NS': (19, 10, 'float', 1, 0, 'rw', 'Gain-floor for stationary noise suppression.', '[-inf .. 0] dB (default: -16dB = 20log10(0.15))'),
    'NONSTATNOISEONOFF': (19, 11, 'int', 1, 0, 'rw', 'Non-stationary noise suppression.', '0 = OFF', '1 = ON'),
    'GAMMA_NN': (19, 12, 'float', 3, 0, 'rw', 'Over-subtraction factor of non-stationary noise. min .. max attenuation'),
    'MIN_NN': (19, 13, 'float', 1, 0, 'rw', 'Gain-floor for non-stationary noise suppression.', '[-inf .. 0] dB (default: -10dB = 20log10(0.3))'),
    'ECHOONOFF': (19, 14, 'int', 1, 0, 'rw', 'Echo suppression.', '0 = OFF', '1 = ON'),
    'GAMMA_E': (19, 15, 'float', 3, 0, 'rw', 'Over-subtraction factor of echo (direct and early components). min .. max attenuation'),
    'GAMMA_ETAIL': (19, 16, 'float', 3, 0, 'rw', 'Over-subtraction factor of echo (tail components). min .. max attenuation'),
    'GAMMA_ENL': (19, 17, 'float', 5, 0, 'rw', 'Over-subtraction factor of non-linear echo. min .. max attenuation'),
    'NLATTENONOFF': (19, 18, 'int', 1, 0, 'rw', 'Non-Linear echo attenuation.', '0 = OFF', '1 = ON'),
    'NLAEC_MODE': (19, 20, 'int', 2, 0, 'rw', 'Non-Linear AEC training mode.', '0 = OFF', '1 = ON - phase 1', '2 = ON - phase 2'),
    'SPEECHDETECTED': (19, 22, 'int', 1, 0, 'ro', 'Speech detection status.', '0 = false (no speech detected)', '1 = true (speech detected)'),
    'FSBUPDATED': (19, 23, 'int', 1, 0, 'ro', 'FSB Update Decision.', '0 = false (FSB was not updated)', '1 = true (FSB was updated)'),
    'FSBPATHCHANGE': (19, 24, 'int', 1, 0, 'ro', 'FSB Path Change Detection.', '0 = false (no path change detected)', '1 = true (path change detected)'),
    'TRANSIENTONOFF': (19, 29, 'int', 1, 0, 'rw', 'Transient echo suppression.', '0 = OFF', '1 = ON'),
    'VOICEACTIVITY': (19, 32, 'int', 1, 0, 'ro', 'VAD voice activity status.', '0 = false (no voice activity)', '1 = true (voice activity)'),
    'STATNOISEONOFF_SR': (19, 33, 'int', 1, 0, 'rw', 'Stationary noise suppression for ASR.', '0 = OFF', '1 = ON'),
    'NONSTATNOISEONOFF_SR': (19, 34, 'int', 1, 0, 'rw', 'Non-stationary noise suppression for ASR.', '0 = OFF', '1 = ON'),
    'GAMMA_NS_SR': (19, 35, 'float', 3, 0, 'rw', 'Over-subtraction factor of stationary noise for ASR.', '[0.0 .. 3.0] (default: 1.0)'),
    'GAMMA_NN_SR': (19, 36, 'float', 3, 0, 'rw', 'Over-subtraction factor of non-stationary noise for ASR.', '[0.0 .. 3.0] (default: 1.1)'),
    'MIN_NS_SR': (19, 37, 'float', 1, 0, 'rw', 'Gain-floor for stationary noise suppression for ASR.', '[-inf .. 0] dB (default: -16dB = 20log10(0.15))'),
    'MIN_NN_SR': (19, 38, 'float', 1, 0, 'rw', 'Gain-floor for non-stationary noise suppression for ASR.', '[-inf .. 0] dB (default: -10dB = 20log10(0.3))'),
    'GAMMAVAD_SR': (19, 39, 'float', 1000, 0, 'rw', 'Set the threshold for voice activity detection.', '[-inf .. 60] dB (default: 3.5dB 20log10(1.5))'),
    # 'KEYWORDDETECT': (20, 0, 'int', 1, 0, 'ro', 'Keyword detected. Current value so needs polling.'),
    'DOAANGLE': (21, 0, 'int', 359, 0, 'ro', 'DOA angle. Current value. Orientation depends on build configuration.')
}
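
# Example (illustrative only; hypothetical usage, not taken from the original
# file): each PARAMETERS value is laid out as (register id, offset, type, max,
# min, access, description...). Reading 'DOAANGLE' therefore issues a vendor
# control transfer against register 21, offset 0, and returns an int in
# [0, 359]. With the Tuning class defined below, typical calls would look like:
#
#     respeaker = Tuning()
#     print("speech detected:", respeaker.read('SPEECHDETECTED'))
#     print("speaker direction (deg):", respeaker.direction)
#     respeaker.write('AGCONOFF', 0)   # only 'rw' parameters can be written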


class Tuning:
    TIMEOUT = 100000

    # self.dev is the pyusb handle for the ReSpeaker Mic Array v2.0
    # (idVendor 0x2886, idProduct 0x0018); read()/write() below talk to it
    # through vendor-specific USB control transfers.
    def __init__(self):
        self.TIMEOUT = 100000
        self.dev = usb.core.find(idVendor=0x2886, idProduct=0x0018)

        if not self.dev:
            raise RuntimeError("Failed to find Respeaker device")
        rospy.loginfo("Initializing Respeaker device")
        # Initialize inputs and outputs to other module code
        self.audio_signal = None
        self.prediction = None
        # self.param = rospy.get_param('')

    def write(self, name, value):
        try:
            data = PARAMETERS[name]
        except KeyError:
            return

        if data[5] == 'ro':
            raise ValueError('{} is read-only'.format(name))

        id = data[0]

        # 4 bytes offset, 4 bytes value, 4 bytes type
        if data[2] == 'int':
            payload = struct.pack(b'iii', data[1], int(value), 1)
        else:
            payload = struct.pack(b'ifi', data[1], float(value), 0)

        self.dev.ctrl_transfer(
            usb.util.CTRL_OUT | usb.util.CTRL_TYPE_VENDOR | usb.util.CTRL_RECIPIENT_DEVICE,
            0, 0, id, payload, self.TIMEOUT)

    def read(self, name):
        try:
            data = PARAMETERS[name]
        except KeyError:
            return

        id = data[0]

        # Build the command word: bit 7 (0x80) requests the current value,
        # bit 6 (0x40) marks the parameter as an int, and the low bits carry
        # the parameter offset within the register.
        cmd = 0x80 | data[1]
        if data[2] == 'int':
            cmd |= 0x40

        length = 8

        response = self.dev.ctrl_transfer(
            usb.util.CTRL_IN | usb.util.CTRL_TYPE_VENDOR | usb.util.CTRL_RECIPIENT_DEVICE,
            0, cmd, id, length, self.TIMEOUT)

        response = struct.unpack(b'ii', response.tobytes())

        if data[2] == 'int':
            result = response[0]
        else:
            result = response[0] * (2.**response[1])

        return result

    def set_vad_threshold(self, db):
        self.write('GAMMAVAD_SR', db)

    def is_voice(self):
        return self.read('VOICEACTIVITY')

    @property
    def direction(self):
        return self.read('DOAANGLE')

    @property
    def version(self):
        return self.dev.ctrl_transfer(
            usb.util.CTRL_IN | usb.util.CTRL_TYPE_VENDOR | usb.util.CTRL_RECIPIENT_DEVICE,
            0, 0x80, 0, 1, self.TIMEOUT)[0]

    def close(self):
        """
        close the interface
        """
        usb.util.dispose_resources(self.dev)


def get_respeaker_device_id():

    with ignore_stderr():
        p = pyaudio.PyAudio()
    info = p.get_host_api_info_by_index(0)
    num_devices = info.get('deviceCount')

    device_id = -1
    for i in range(num_devices):
        if (p.get_device_info_by_host_api_device_index(0, i).get('maxInputChannels')) > 0:
            if "ReSpeaker" in p.get_device_info_by_host_api_device_index(0, i).get('name'):
                device_id = i

    return device_id


RESPEAKER_RATE = 16000
RESPEAKER_CHANNELS = 6  # must flash 6_channels_firmware.bin first
RESPEAKER_WIDTH = 2
RESPEAKER_INDEX = get_respeaker_device_id()
CHUNK = 1024


def record_audio(seconds=5):
    p = pyaudio.PyAudio()
    stream = p.open(rate=RESPEAKER_RATE,
                    format=p.get_format_from_width(RESPEAKER_WIDTH),
                    channels=RESPEAKER_CHANNELS,
                    input=True,
                    input_device_index=RESPEAKER_INDEX,
                    output=False)

    frames = []
    for i in range(0, int(RESPEAKER_RATE / CHUNK * seconds)):
        data = stream.read(CHUNK)
        a = np.frombuffer(data, dtype=np.int16)[0::6]  # extract channel 0, the processed (beamformed) channel
        frames.append(a.tobytes())

    stream.stop_stream()
    stream.close()
    p.terminate()

    return frames


def save_wav(frames, fname):
    p = pyaudio.PyAudio()
    wf = wave.open(fname, 'wb')
    wf.setnchannels(1)
    wf.setsampwidth(p.get_sample_size(p.get_format_from_width(RESPEAKER_WIDTH)))
    wf.setframerate(RESPEAKER_RATE)
    wf.writeframes(b''.join(frames))
    wf.close()
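
# Example (illustrative sketch; the file path is hypothetical): record_audio()
# returns a list of raw int16 byte chunks from the ReSpeaker's processed
# channel 0, and save_wav() writes them out as 16 kHz mono WAV:
#
#     frames = record_audio(seconds=5)
#     save_wav(frames, "/tmp/respeaker_test.wav")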


def run_mic(respeaker, num, printed_wait_statement):
    if not printed_wait_statement and respeaker.is_voice() == 0:
        print("\n* waiting for audio...")
        printed_wait_statement = True
    elif respeaker.is_voice() == 1:
        print("* recording 5 seconds")
        frames = record_audio()
        print("* done")
        # save_wav(frames, "/home/hello-robot/Desktop/output_audio_" + str(num) + ".wav")
        num += 1
        # send test.wav files
        print("* done")
        printed_wait_statement = False
    return num, printed_wait_statement


if __name__ == "__main__":
    try:
        rospy.init_node("audio_capture")
        num_files = 0
        dev = usb.core.find(idVendor=0x2886, idProduct=0x0018)
        try:
            printed_wait_statement = False
            if dev:
                respeaker = Tuning()
                while not rospy.is_shutdown():
                    try:
                        num_files, printed_wait_statement = run_mic(respeaker, num_files, printed_wait_statement)
                        rospy.sleep(0.2)
                    except KeyboardInterrupt:
                        break
        except usb.core.USBError:
            print('Respeaker not on USB bus')
    except rospy.ROSInterruptException:
        print('Audio processing node failed!')
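
A minimal sketch of the subscriber side that the ros_interface.py hunk above is working toward: it buffers the uint8[] chunks from MyAudioData and writes them out as a single WAV file on shutdown. The topic name (/audio), the assumption that the payload is 16-bit mono PCM at the ReSpeaker's 16 kHz rate, and the output path are assumptions for illustration, not taken from the patch.

#!/usr/bin/env python3
import wave
import rospy
from vz_acoustic_scene_analysis.msg import MyAudioData

class WavDump:
    def __init__(self, path='/tmp/test.wav', rate=16000):
        self.chunks = []
        self.path = path
        self.rate = rate
        rospy.Subscriber('/audio', MyAudioData, self.cb)  # assumed topic name
        rospy.on_shutdown(self.save)

    def cb(self, msg):
        # msg.data is a uint8[] block of raw PCM bytes; keep it as bytes
        self.chunks.append(bytes(bytearray(msg.data)))

    def save(self):
        wf = wave.open(self.path, 'wb')
        wf.setnchannels(1)          # assumed mono (processed channel 0)
        wf.setsampwidth(2)          # assumed 16-bit samples
        wf.setframerate(self.rate)  # assumed ReSpeaker rate (16 kHz)
        wf.writeframes(b''.join(self.chunks))
        wf.close()

if __name__ == '__main__':
    rospy.init_node('wav_dump')
    WavDump()
    rospy.spin()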