|
34 | 34 | % http://zafarrafii.com
|
35 | 35 | % https://github.com/zafarrafii
|
36 | 36 | % https://www.linkedin.com/in/zafarrafii/
|
37 |
| - % 01/26/21 |
| 37 | + % 01/27/21 |
38 | 38 |
|
39 | 39 | % Define the properties
|
40 | 40 | properties (Access = private, Constant = true)
|
|
713 | 713 | % Get the number of samples and channels
|
714 | 714 | [number_samples,number_channels] = size(audio_signal);
|
715 | 715 |
|
716 |
| - % Window length, window function, and step length for the STFT |
717 |
| - window_length = repet.windowlength(sampling_frequency); |
718 |
| - window_function = repet.windowfunction(window_length); |
719 |
| - step_length = repet.steplength(window_length); |
| 716 | + % Set the parameters for the STFT |
| 717 | + % (audio stationary around 40 ms, power of 2 for fast FFT and constant overlap-add (COLA), |
| 718 | + % periodic Hamming window for COLA, and step equal to half the window length for COLA) |
| 719 | + window_length = 2^nextpow2(0.04*sampling_frequency); |
| 720 | + window_function = hamming(window_length,'periodic'); |
| 721 | + step_length = window_length/2; |
720 | 722 |
|
721 |
| - % Number of time frames |
| 723 | + % Derive the number of time frames |
722 | 724 | number_times = ceil((number_samples-window_length)/step_length+1);
|
723 | 725 |
|
724 |
| - % Buffer length in time frames |
725 |
| - buffer_length = round((repet.buffer_length*sampling_frequency-window_length)/step_length+1); |
| 726 | + % Derive the number of frequency channels |
| 727 | + number_frequencies = window_length/2+1; |
726 | 728 |
|
727 |
| - % Initialize the buffer spectrogram |
728 |
| - buffer_spectrogram = zeros(window_length/2+1,buffer_length,number_channels); |
| 729 | + % Get the buffer length in time frames |
| 730 | + buffer_length2 = round((repet.buffer_length*sampling_frequency)/step_length); |
729 | 731 |
|
730 |
| - % Open the wait bar |
731 |
| - wait_bar = waitbar(0,'Online REPET-SIM'); |
| 732 | + % Initialize the buffer spectrogram |
| 733 | + buffer_spectrogram = zeros(number_frequencies,buffer_length2,number_channels); |
732 | 734 |
|
733 | 735 | % Loop over the time frames to compute the buffer spectrogram
|
734 | 736 | % (the last frame will be the frame to be processed)
|
735 |
| - for time_index = 1:buffer_length-1 |
736 |
| - |
737 |
| - % Sample index in the signal |
738 |
| - sample_index = step_length*(time_index-1); |
| 737 | + k = 0; |
| 738 | + for j = 1:buffer_length2-1 |
739 | 739 |
|
740 | 740 | % Loop over the channels
|
741 |
| - for channel_index = 1:number_channels |
| 741 | + for i = 1:number_channels |
742 | 742 |
|
743 | 743 | % Compute the FT of the segment
|
744 |
| - buffer_ft = fft(audio_signal(1+sample_index:window_length+sample_index,channel_index).*window_function); |
| 744 | + buffer_ft = fft(audio_signal(k+1:k+window_length,i).*window_function); |
745 | 745 |
|
746 | 746 | % Derive the spectrum of the frame
|
747 |
| - buffer_spectrogram(:,time_index,channel_index) = abs(buffer_ft(1:window_length/2+1)); |
| 747 | + buffer_spectrogram(:,j,i) = abs(buffer_ft(1:number_frequencies)); |
748 | 748 |
|
749 | 749 | end
|
750 | 750 |
|
751 |
| - % Update the wait bar |
752 |
| - waitbar(time_index/number_times,wait_bar); |
| 751 | + % Update the index |
| 752 | + k = k+step_length; |
753 | 753 |
|
754 | 754 | end
|
755 | 755 |
|
756 | 756 | % Zero-pad the audio signal at the end
|
757 | 757 | audio_signal = [audio_signal;zeros((number_times-1)*step_length+window_length-number_samples,number_channels)];
|
758 | 758 |
|
759 |
| - % Similarity distance in time frames |
760 |
| - similarity_distance = round(repet.similarity_distance*sampling_frequency/step_length); |
| 759 | + % Get the similarity distance in time frames |
| 760 | + similarity_distance2 = round(repet.similarity_distance*sampling_frequency/step_length); |
761 | 761 |
|
762 |
| - % Cutoff frequency in frequency channels for the dual high-pass |
763 |
| - % filter of the foreground |
764 |
| - cutoff_frequency = ceil(repet.cutoff_frequency*(window_length-1)/sampling_frequency); |
| 762 | + % Get the cutoff frequency in frequency channels |
| 763 | + % for the dual high-pass filter of the foreground |
| 764 | + cutoff_frequency2 = ceil(repet.cutoff_frequency*window_length/sampling_frequency); |
765 | 765 |
|
766 | 766 | % Initialize the background signal
|
767 | 767 | background_signal = zeros((number_times-1)*step_length+window_length,number_channels);
|
768 | 768 |
|
769 | 769 | % Loop over the time frames to compute the background signal
|
770 |
| - for time_index = buffer_length:number_times |
771 |
| - |
772 |
| - % Sample index in the signal |
773 |
| - sample_index = step_length*(time_index-1); |
| 770 | + for j = buffer_length2:number_times |
774 | 771 |
|
775 |
| - % Time index of the current frame |
776 |
| - current_index = mod(time_index-1,buffer_length)+1; |
| 772 | + % Get the time index of the current frame |
| 773 | + j0 = mod(j-1,buffer_length2)+1; |
777 | 774 |
|
778 | 775 | % Initialize the FT of the current segment
|
779 | 776 | current_ft = zeros(window_length,number_channels);
|
780 | 777 |
|
781 | 778 | % Loop over the channels
|
782 |
| - for channel_index = 1:number_channels |
| 779 | + for i = 1:number_channels |
783 | 780 |
|
784 | 781 | % Compute the FT of the current segment
|
785 |
| - current_ft(:,channel_index) ... |
786 |
| - = fft(audio_signal(1+sample_index:window_length+sample_index,channel_index).*window_function); |
| 782 | + current_ft(:,i) = fft(audio_signal(k+1:k+window_length,i).*window_function); |
787 | 783 |
|
788 |
| - % Derive the spectrum of the current frame and update |
789 |
| - % the buffer spectrogram |
790 |
| - buffer_spectrogram(:,current_index,channel_index) ... |
791 |
| - = abs(current_ft(1:window_length/2+1,channel_index)); |
| 784 | + % Derive the magnitude spectrum and update the buffer spectrogram |
| 785 | + buffer_spectrogram(:,j0,i) = abs(current_ft(1:number_frequencies,i)); |
792 | 786 |
|
793 | 787 | end
|
794 | 788 |
|
795 |
| - % Cosine similarity between the spectrum of the current |
796 |
| - % frame and the past frames, for all the channels |
| 789 | + % Compute the cosine similarity between the spectrum of the current frame |
| 790 | + % and the past frames, for all the channels |
797 | 791 | similarity_vector ...
|
798 |
| - = repet.similaritymatrix(mean(buffer_spectrogram,3),mean(buffer_spectrogram(:,current_index,:),3)); |
| 792 | + = repet.similaritymatrix(mean(buffer_spectrogram,3),mean(buffer_spectrogram(:,j0,:),3)); |
799 | 793 |
|
800 |
| - % Indices of the similar frames |
| 794 | + % Estimate the indices of the similar frames |
801 | 795 | [~,similarity_indices] ...
|
802 |
| - = repet.localmaxima(similarity_vector,repet.similarity_threshold,similarity_distance,repet.similarity_number); |
| 796 | + = repet.localmaxima(similarity_vector,repet.similarity_threshold,similarity_distance2,repet.similarity_number); |
803 | 797 |
|
804 | 798 | % Loop over the channels
|
805 |
| - for channel_index = 1:number_channels |
| 799 | + for i = 1:number_channels |
806 | 800 |
|
807 | 801 | % Compute the repeating spectrum for the current frame
|
808 |
| - repeating_spectrum = median(buffer_spectrogram(:,similarity_indices,channel_index),2); |
| 802 | + repeating_spectrum = median(buffer_spectrogram(:,similarity_indices,i),2); |
809 | 803 |
|
810 | 804 | % Refine the repeating spectrum
|
811 |
| - repeating_spectrum = min(repeating_spectrum,buffer_spectrogram(:,current_index,channel_index)); |
| 805 | + repeating_spectrum = min(repeating_spectrum,buffer_spectrogram(:,j0,i)); |
812 | 806 |
|
813 | 807 | % Derive the repeating mask for the current frame
|
814 |
| - repeating_mask = (repeating_spectrum+eps)./(buffer_spectrogram(:,current_index,channel_index)+eps); |
| 808 | + repeating_mask = (repeating_spectrum+eps)./(buffer_spectrogram(:,j0,i)+eps); |
815 | 809 |
|
816 |
| - % High-pass filtering of the (dual) non-repeating |
817 |
| - % foreground |
818 |
| - repeating_mask(2:cutoff_frequency+1,:) = 1; |
| 810 | + % Perform a high-pass filtering of the dual foreground |
| 811 | + repeating_mask(2:cutoff_frequency2+1,:) = 1; |
819 | 812 |
|
820 |
| - % Mirror the frequency channels |
| 813 | + % Recover the mirrored frequencies |
821 | 814 | repeating_mask = cat(1,repeating_mask,repeating_mask(end-1:-1:2));
|
822 | 815 |
|
823 | 816 | % Apply the mask to the FT of the current segment
|
824 |
| - background_ft = repeating_mask.*current_ft(:,channel_index); |
| 817 | + background_ft = repeating_mask.*current_ft(:,i); |
825 | 818 |
|
826 |
| - % Inverse FT of the current segment |
827 |
| - background_signal(1+sample_index:window_length+sample_index,channel_index) ... |
828 |
| - = background_signal(1+sample_index:window_length+sample_index,channel_index)+real(ifft(background_ft)); |
| 819 | + % Take the inverse FT of the current segment |
| 820 | + background_signal(k+1:k+window_length,i) ... |
| 821 | + = background_signal(k+1:k+window_length,i)+real(ifft(background_ft)); |
829 | 822 |
|
830 | 823 | end
|
831 | 824 |
|
832 |
| - % Update the wait bar |
833 |
| - waitbar(time_index/number_times,wait_bar); |
| 825 | + % Update the index |
| 826 | + k = k+step_length; |
834 | 827 |
|
835 | 828 | end
|
836 | 829 |
|
837 | 830 | % Truncate the signal to the original length
|
838 | 831 | background_signal = background_signal(1:number_samples,:);
|
839 | 832 |
|
840 |
| - % Un-window the signal (just in case) |
| 833 | + % Normalize the signal by the gain introduced by the COLA (if any) |
841 | 834 | background_signal = background_signal/sum(window_function(1:step_length:window_length));
|
842 | 835 |
|
843 |
| - % Close the wait bar |
844 |
| - close(wait_bar) |
845 |
| - |
846 | 836 | end
|
847 | 837 |
|
848 | 838 | function specshow(audio_spectrogram, number_samples, sampling_frequency, xtick_step, ytick_step)
|
|
0 commit comments