| OLD | NEW | 
|    1 /* |    1 /* | 
|    2  *  Copyright (c) 2014 The WebRTC project authors. All Rights Reserved. |    2  *  Copyright (c) 2014 The WebRTC project authors. All Rights Reserved. | 
|    3  * |    3  * | 
|    4  *  Use of this source code is governed by a BSD-style license |    4  *  Use of this source code is governed by a BSD-style license | 
|    5  *  that can be found in the LICENSE file in the root of the source |    5  *  that can be found in the LICENSE file in the root of the source | 
|    6  *  tree. An additional intellectual property rights grant can be found |    6  *  tree. An additional intellectual property rights grant can be found | 
|    7  *  in the file PATENTS.  All contributing project authors may |    7  *  in the file PATENTS.  All contributing project authors may | 
|    8  *  be found in the AUTHORS file in the root of the source tree. |    8  *  be found in the AUTHORS file in the root of the source tree. | 
|    9  */ |    9  */ | 
|   10  |   10  | 
|   11 // |   11 // | 
|   12 //  Specifies core class for intelligibility enhancement. |   12 //  Specifies core class for intelligibility enhancement. | 
|   13 // |   13 // | 
|   14  |   14  | 
|   15 #ifndef WEBRTC_MODULES_AUDIO_PROCESSING_INTELLIGIBILITY_INTELLIGIBILITY_ENHANCER
     _H_ |   15 #ifndef WEBRTC_MODULES_AUDIO_PROCESSING_INTELLIGIBILITY_INTELLIGIBILITY_ENHANCER
     _H_ | 
|   16 #define WEBRTC_MODULES_AUDIO_PROCESSING_INTELLIGIBILITY_INTELLIGIBILITY_ENHANCER
     _H_ |   16 #define WEBRTC_MODULES_AUDIO_PROCESSING_INTELLIGIBILITY_INTELLIGIBILITY_ENHANCER
     _H_ | 
|   17  |   17  | 
|   18 #include <complex> |   18 #include <complex> | 
|   19 #include <vector> |   19 #include <vector> | 
|   20  |   20  | 
|   21 #include "webrtc/base/scoped_ptr.h" |   21 #include "webrtc/base/scoped_ptr.h" | 
|   22 #include "webrtc/common_audio/lapped_transform.h" |   22 #include "webrtc/common_audio/lapped_transform.h" | 
 |   23 #include "webrtc/common_audio/channel_buffer.h" | 
|   23 #include "webrtc/modules/audio_processing/intelligibility/intelligibility_utils.
     h" |   24 #include "webrtc/modules/audio_processing/intelligibility/intelligibility_utils.
     h" | 
|   24  |   25  | 
|   25 struct WebRtcVadInst; |  | 
|   26 typedef struct WebRtcVadInst VadInst; |  | 
|   27  |  | 
|   28 namespace webrtc { |   26 namespace webrtc { | 
|   29  |   27  | 
|   30 // Speech intelligibility enhancement module. Reads render and capture |   28 // Speech intelligibility enhancement module. Reads render and capture | 
|   31 // audio streams and modifies the render stream with a set of gains per |   29 // audio streams and modifies the render stream with a set of gains per | 
|   32 // frequency bin to enhance speech against the noise background. |   30 // frequency bin to enhance speech against the noise background. | 
|   33 // Note: assumes speech and noise streams are already separated. |   31 // Note: assumes speech and noise streams are already separated. | 
|   34 class IntelligibilityEnhancer { |   32 class IntelligibilityEnhancer { | 
|   35  public: |   33  public: | 
|   36   // Construct a new instance with the given filter bank resolution, |   34   struct Config { | 
|   37   // sampling rate, number of channels and analysis rates. |   35     // |var_*| are parameters for the VarianceArray constructor for the | 
|   38   // |analysis_rate| sets the number of input blocks (containing speech!) |   36     // clear speech stream. | 
|   39   // to elapse before a new gain computation is made. |variance_rate| specifies |   37     // TODO(bercic): the |var_*|, |*_rate| and |gain_limit| parameters should | 
|   40   // the number of gain recomputations after which the variances are reset. |   38     // probably go away once fine tuning is done. | 
|   41   // |cv_*| are parameters for the VarianceArray constructor for the |   39     Config() | 
|   42   // clear speech stream. |   40         : sample_rate_hz(16000), | 
|   43   // TODO(bercic): the |cv_*|, |*_rate| and |gain_limit| parameters should |   41           num_capture_channels(1), | 
|   44   // probably go away once fine tuning is done. They override the internal |   42           num_render_channels(1), | 
|   45   // constants in the class (kGainChangeLimit, kAnalyzeRate, kVarianceRate). |   43           var_type(intelligibility::VarianceArray::kStepDecaying), | 
|   46   IntelligibilityEnhancer(int erb_resolution, |   44           var_decay_rate(0.9f), | 
|   47                           int sample_rate_hz, |   45           var_window_size(10), | 
|   48                           int channels, |   46           analysis_rate(800), | 
|   49                           int cv_type, |   47           gain_change_limit(0.1f), | 
|   50                           float cv_alpha, |   48           rho(0.02f) {} | 
|   51                           int cv_win, |   49     int sample_rate_hz; | 
|   52                           int analysis_rate, |   50     int num_capture_channels; | 
|   53                           int variance_rate, |   51     int num_render_channels; | 
|   54                           float gain_limit); |   52     intelligibility::VarianceArray::StepType var_type; | 
|   55   ~IntelligibilityEnhancer(); |   53     float var_decay_rate; | 
 |   54     int var_window_size; | 
 |   55     int analysis_rate; | 
 |   56     float gain_change_limit; | 
 |   57     float rho; | 
 |   58   }; | 
 |   59  | 
 |   60   explicit IntelligibilityEnhancer(const Config& config); | 
 |   61   IntelligibilityEnhancer();  // Initialize with default config. | 
|   56  |   62  | 
|   57   // Reads and processes chunk of noise stream in time domain. |   63   // Reads and processes chunk of noise stream in time domain. | 
|   58   void ProcessCaptureAudio(float* const* audio); |   64   void AnalyzeCaptureAudio(float* const* audio, | 
 |   65                            int sample_rate_hz, | 
 |   66                            int num_channels); | 
|   59  |   67  | 
|   60   // Reads chunk of speech in time domain and updates with modified signal. |   68   // Reads chunk of speech in time domain and updates with modified signal. | 
|   61   void ProcessRenderAudio(float* const* audio); |   69   void ProcessRenderAudio(float* const* audio, | 
 |   70                           int sample_rate_hz, | 
 |   71                           int num_channels); | 
 |   72   bool active() const; | 
|   62  |   73  | 
|   63  private: |   74  private: | 
|   64   enum AudioSource { |   75   enum AudioSource { | 
|   65     kRenderStream = 0,  // Clear speech stream. |   76     kRenderStream = 0,  // Clear speech stream. | 
|   66     kCaptureStream,  // Noise stream. |   77     kCaptureStream,  // Noise stream. | 
|   67   }; |   78   }; | 
|   68  |   79  | 
|   69   // Provides access point to the frequency domain. |   80   // Provides access point to the frequency domain. | 
|   70   class TransformCallback : public LappedTransform::Callback { |   81   class TransformCallback : public LappedTransform::Callback { | 
|   71    public: |   82    public: | 
| (...skipping 54 matching lines...) Expand 10 before | Expand all | Expand 10 after  Loading... | 
|  126  |  137  | 
|  127   // Returns dot product of vectors specified by size |length| arrays |a|,|b|. |  138   // Returns dot product of vectors specified by size |length| arrays |a|,|b|. | 
|  128   static float DotProduct(const float* a, const float* b, int length); |  139   static float DotProduct(const float* a, const float* b, int length); | 
|  129  |  140  | 
|  130   const int freqs_;         // Num frequencies in frequency domain. |  141   const int freqs_;         // Num frequencies in frequency domain. | 
|  131   const int window_size_;   // Window size in samples; also the block size. |  142   const int window_size_;   // Window size in samples; also the block size. | 
|  132   const int chunk_length_;  // Chunk size in samples. |  143   const int chunk_length_;  // Chunk size in samples. | 
|  133   const int bank_size_;     // Num ERB filters. |  144   const int bank_size_;     // Num ERB filters. | 
|  134   const int sample_rate_hz_; |  145   const int sample_rate_hz_; | 
|  135   const int erb_resolution_; |  146   const int erb_resolution_; | 
|  136   const int channels_;       // Num channels. |  147   const int num_capture_channels_; | 
 |  148   const int num_render_channels_; | 
|  137   const int analysis_rate_;  // Num blocks before gains recalculated. |  149   const int analysis_rate_;  // Num blocks before gains recalculated. | 
|  138   const int variance_rate_;  // Num recalculations before history is cleared. |  150  | 
 |  151   const bool active_;  // Whether render gains are being updated. | 
 |  152                        // TODO(ekm): Add logic for updating |active_|. | 
|  139  |  153  | 
|  140   intelligibility::VarianceArray clear_variance_; |  154   intelligibility::VarianceArray clear_variance_; | 
|  141   intelligibility::VarianceArray noise_variance_; |  155   intelligibility::VarianceArray noise_variance_; | 
|  142   rtc::scoped_ptr<float[]> filtered_clear_var_; |  156   rtc::scoped_ptr<float[]> filtered_clear_var_; | 
|  143   rtc::scoped_ptr<float[]> filtered_noise_var_; |  157   rtc::scoped_ptr<float[]> filtered_noise_var_; | 
|  144   std::vector<std::vector<float>> filter_bank_; |  158   std::vector<std::vector<float>> filter_bank_; | 
|  145   rtc::scoped_ptr<float[]> center_freqs_; |  159   rtc::scoped_ptr<float[]> center_freqs_; | 
|  146   int start_freq_; |  160   int start_freq_; | 
|  147   rtc::scoped_ptr<float[]> rho_;  // Production and interpretation SNR |  161   rtc::scoped_ptr<float[]> rho_;  // Production and interpretation SNR | 
|  148                                   // for each ERB band. |  162                                   // for each ERB band. | 
|  149   rtc::scoped_ptr<float[]> gains_eq_;  // Pre-filter modified gains. |  163   rtc::scoped_ptr<float[]> gains_eq_;  // Pre-filter modified gains. | 
|  150   intelligibility::GainApplier gain_applier_; |  164   intelligibility::GainApplier gain_applier_; | 
|  151  |  165  | 
|  152   // Destination buffer used to reassemble blocked chunks before overwriting |  166   // Destination buffers used to reassemble blocked chunks before overwriting | 
|  153   // the original input array with modifications. |  167   // the original input array with modifications. | 
|  154   // TODO(ekmeyerson): Switch to using ChannelBuffer. |  168   ChannelBuffer<float> temp_render_out_buffer_; | 
|  155   float** temp_out_buffer_; |  169   ChannelBuffer<float> temp_capture_out_buffer_; | 
|  156  |  170  | 
|  157   rtc::scoped_ptr<float* []> input_audio_; |  | 
|  158   rtc::scoped_ptr<float[]> kbd_window_; |  171   rtc::scoped_ptr<float[]> kbd_window_; | 
|  159   TransformCallback render_callback_; |  172   TransformCallback render_callback_; | 
|  160   TransformCallback capture_callback_; |  173   TransformCallback capture_callback_; | 
|  161   rtc::scoped_ptr<LappedTransform> render_mangler_; |  174   rtc::scoped_ptr<LappedTransform> render_mangler_; | 
|  162   rtc::scoped_ptr<LappedTransform> capture_mangler_; |  175   rtc::scoped_ptr<LappedTransform> capture_mangler_; | 
|  163   int block_count_; |  176   int block_count_; | 
|  164   int analysis_step_; |  177   int analysis_step_; | 
|  165  |  | 
|  166   // TODO(bercic): Quick stopgap measure for voice detection in the clear |  | 
|  167   // and noise streams. |  | 
|  168   // Note: VAD currently does not affect anything in IntelligibilityEnhancer. |  | 
|  169   VadInst* vad_high_; |  | 
|  170   VadInst* vad_low_; |  | 
|  171   rtc::scoped_ptr<int16_t[]> vad_tmp_buffer_; |  | 
|  172   bool has_voice_low_;  // Whether voice detected in speech stream. |  | 
|  173 }; |  178 }; | 
|  174  |  179  | 
|  175 }  // namespace webrtc |  180 }  // namespace webrtc | 
|  176  |  181  | 
|  177 #endif  // WEBRTC_MODULES_AUDIO_PROCESSING_INTELLIGIBILITY_INTELLIGIBILITY_ENHAN
     CER_H_ |  182 #endif  // WEBRTC_MODULES_AUDIO_PROCESSING_INTELLIGIBILITY_INTELLIGIBILITY_ENHAN
     CER_H_ | 
| OLD | NEW |