I've synthesized a spoken clock audio signal in python (code below) and did a proof of concept in Kdenlive 21.03.70: it works! [EDIT: Not so, after all 8-( — I ran a test on an hour-long recording and it completely missed the alignment at the hour mark] At this link you can listen to a soundfile mixed in Kdenlive after it successfully aligned the two sounds. The file is only mono (left channel) at the beginning, but at 10 seconds in you can hear (right channel) the test track aligned within 7 ms by Kdenlive
So the hardware is as simple as a SAMD21 board (10 USD) and a GPS (20 to 30 USD, depending on the retailer)! Stay tuned for a working prototype!
"""Generate a WAV file (TEST_FILE) of 20 short 2 kHz bips, each followed by
the corresponding spoken time (HH MM SS).

A sample WAV output file is presented there
https://mamot.fr/@lutzray/105232599071636881

The time is simply incremented and nothing is done for assuring the validity
of the syncing bip position; it's a proof of concept for testing NLE video
editing programs and synthetic voices.

sox is used for merging WAV files and should be available on your PATH.
63 sound clips are necessary and should be in the VOICE_CLIPS_REP directory:
the spoken numbers 0 to 59 plus 3 spoken words, named

    0.wav 1.wav ... 59.wav heure.wav minute.wav seconde.wav

French sound files are momentarily (Nov 2020) accessible at this URL
https://famille.lutz.quebec/index.php/s/bTKjTYC88ke7Ykp

Ongoing project documented at https://atomicsynchronator.blogspot.com/
"""
import datetime
import os
import shutil
import subprocess

import numpy as np
import soundfile as sf

VOICE_CLIPS_REP = 'libtts/'        # directory of spoken clips, see above
TEST_FILE = 'bipPlusVoices.wav'    # output
INTERVAL_BETWEEN_BIPS = 5          # in secs
SYNC_BIP_FREQUENCY = 2000          # in Hertz
SYNC_BIP_LENGTH = 50               # in whole cycles
INTERVAL_BEFORE_VOICE = 0.5        # in seconds
hh, mm, ss = ('heure', 'minute', 'seconde')
SAMPLING_RATE = 44100              # samples per second
AMPL = 0.97  # 1.0 is detected as a clipped soundfile by some editing programs


def gen_sinus(amp, freq, duration):
    """Return a sine burst of `duration` whole cycles at `freq` Hz, peak `amp`.

    endpoint=False keeps the sample spacing at exactly 1/SAMPLING_RATE so the
    burst really oscillates at `freq` Hz (including the endpoint, as the
    original did, stretches the period slightly and detunes the sync bip).
    """
    period = 1 / freq
    length = duration * period  # in secs
    n_samples = int(SAMPLING_RATE * length)
    t = np.linspace(0, length, n_samples, endpoint=False)
    return amp * np.sin(2 * np.pi * freq * t)


def append_data_to_WAV(audio_data, existing_snd_fn):
    """Append `audio_data` to the WAV file `existing_snd_fn` (created if needed).

    sox (looked up on PATH, as the module docstring requires) performs the
    concatenation so the result keeps a valid WAV header.  List-form
    subprocess.run avoids shell quoting problems with the filenames.
    """
    new_sounds_fn = '/tmp/newsounds.wav'
    sox_output_fn = '/tmp/soxout.wav'
    sf.write(new_sounds_fn, audio_data, SAMPLING_RATE)
    if not os.path.exists(existing_snd_fn):
        # first chunk: the new data simply becomes the output file
        shutil.move(new_sounds_fn, existing_snd_fn)
        return
    subprocess.run(['sox', existing_snd_fn, new_sounds_fn, sox_output_fn],
                   check=True)
    shutil.move(sox_output_fn, existing_snd_fn)


def patch_voice_clips_from_TTS_lib(clips):
    """Concatenate the audio of the named clips, in order.

    `clips` is a list of filenames without the '.wav' suffix, e.g.
    ['23', 'heure'] (integers are accepted too via %s formatting).
    NOTE(review): assumes every clip is mono at SAMPLING_RATE — confirm.
    """
    sounds = [sf.read('%s%s.wav' % (VOICE_CLIPS_REP, clip)) for clip in clips]
    # sf.read returns (data, samplerate); keep only the data
    sounds = [np.array(clip[0]) for clip in sounds]
    return np.concatenate(sounds)


sync_bip = gen_sinus(AMPL, SYNC_BIP_FREQUENCY, SYNC_BIP_LENGTH)
if os.path.exists(TEST_FILE):
    os.remove(TEST_FILE)  # start from scratch on each run
pseudo_now = datetime.datetime.now()
for i_bip in range(20):  # number of spoken times in the output file
    silence_bef_voice = np.zeros(int(INTERVAL_BEFORE_VOICE * SAMPLING_RATE))
    time_strings = [pseudo_now.hour, hh, pseudo_now.minute, mm,
                    pseudo_now.second, ss]
    voice_data = patch_voice_clips_from_TTS_lib(time_strings)
    voice_duration = len(voice_data)  # in samples
    silence_length_after_voice = (INTERVAL_BETWEEN_BIPS
                                  - voice_duration / SAMPLING_RATE
                                  - INTERVAL_BEFORE_VOICE
                                  - len(sync_bip) / SAMPLING_RATE)
    # clamp at zero: a long spoken time must not crash np.zeros() with a
    # negative size (it can only shift the following bips, not the file)
    silence_after_voice = np.zeros(
        max(0, int(silence_length_after_voice * SAMPLING_RATE)))
    append_data_to_WAV(sync_bip, TEST_FILE)
    append_data_to_WAV(silence_bef_voice, TEST_FILE)
    append_data_to_WAV(voice_data, TEST_FILE)
    append_data_to_WAV(silence_after_voice, TEST_FILE)
    pseudo_now = pseudo_now + datetime.timedelta(seconds=INTERVAL_BETWEEN_BIPS)
Comments
Post a Comment