Skip to content

Commit

Permalink
AudioEncoderCng: Handle case where speech encoder is reset
Browse files Browse the repository at this point in the history
Previously, AudioEncoderCng required that the speech encoder not change
the number of 10 ms frames it reports for the next packet between calls
to AudioEncoderCng::EncodeInternal(). Specifically, it could handle an
upward but not a downward adjustment. With this patch, it can handle a
downward adjustment too, by simply saving the overshoot data (the frames
buffered beyond what the packet needs) for the next call to EncodeInternal().

It will still not handle the case where the encoder's reported number
of 10 ms frames in the next packet is inconsistent with the behavior
of its Encode() function when called with no intervening changes to
the encoder.

R=henrik.lundin@webrtc.org

Review URL: https://webrtc-codereview.appspot.com/53469005

Cr-Commit-Position: refs/heads/master@{#9261}
  • Loading branch information
Henrik Lundin committed May 22, 2015
1 parent f761d10 commit 367c868
Show file tree
Hide file tree
Showing 3 changed files with 55 additions and 54 deletions.
55 changes: 28 additions & 27 deletions webrtc/modules/audio_coding/codecs/cng/audio_encoder_cng.cc
Original file line number Diff line number Diff line change
Expand Up @@ -50,8 +50,6 @@ AudioEncoderCng::AudioEncoderCng(const Config& config)
: speech_encoder_(config.speech_encoder),
cng_payload_type_(config.payload_type),
num_cng_coefficients_(config.num_cng_coefficients),
first_timestamp_in_buffer_(0),
frames_in_buffer_(0),
last_frame_active_(true),
vad_(new Vad(config.vad_mode)) {
if (config.vad) {
Expand Down Expand Up @@ -115,35 +113,31 @@ AudioEncoder::EncodedInfo AudioEncoderCng::EncodeInternal(
size_t max_encoded_bytes,
uint8_t* encoded) {
CHECK_GE(max_encoded_bytes, static_cast<size_t>(num_cng_coefficients_ + 1));
const int num_samples = SampleRateHz() / 100 * NumChannels();
if (speech_buffer_.empty()) {
CHECK_EQ(frames_in_buffer_, 0);
first_timestamp_in_buffer_ = rtp_timestamp;
}
for (int i = 0; i < num_samples; ++i) {
const size_t samples_per_10ms_frame = SamplesPer10msFrame();
CHECK_EQ(speech_buffer_.size(),
rtp_timestamps_.size() * samples_per_10ms_frame);
rtp_timestamps_.push_back(rtp_timestamp);
for (size_t i = 0; i < samples_per_10ms_frame; ++i) {
speech_buffer_.push_back(audio[i]);
}
++frames_in_buffer_;
if (frames_in_buffer_ < speech_encoder_->Num10MsFramesInNextPacket()) {
const int frames_to_encode = speech_encoder_->Num10MsFramesInNextPacket();
if (rtp_timestamps_.size() < static_cast<size_t>(frames_to_encode)) {
return EncodedInfo();
}
CHECK_LE(frames_in_buffer_ * 10, kMaxFrameSizeMs)
CHECK_LE(frames_to_encode * 10, kMaxFrameSizeMs)
<< "Frame size cannot be larger than " << kMaxFrameSizeMs
<< " ms when using VAD/CNG.";
const size_t samples_per_10ms_frame = 10 * SampleRateHz() / 1000;
CHECK_EQ(speech_buffer_.size(),
static_cast<size_t>(frames_in_buffer_) * samples_per_10ms_frame);

// Group several 10 ms blocks per VAD call. Call VAD once or twice using the
// following split sizes:
// 10 ms = 10 + 0 ms; 20 ms = 20 + 0 ms; 30 ms = 30 + 0 ms;
// 40 ms = 20 + 20 ms; 50 ms = 30 + 20 ms; 60 ms = 30 + 30 ms.
int blocks_in_first_vad_call =
(frames_in_buffer_ > 3 ? 3 : frames_in_buffer_);
if (frames_in_buffer_ == 4)
(frames_to_encode > 3 ? 3 : frames_to_encode);
if (frames_to_encode == 4)
blocks_in_first_vad_call = 2;
const int blocks_in_second_vad_call =
frames_in_buffer_ - blocks_in_first_vad_call;
frames_to_encode - blocks_in_first_vad_call;
CHECK_GE(blocks_in_second_vad_call, 0);

// Check if all of the buffer is passive speech. Start with checking the first
Expand All @@ -161,12 +155,12 @@ AudioEncoder::EncodedInfo AudioEncoderCng::EncodeInternal(
EncodedInfo info;
switch (activity) {
case Vad::kPassive: {
info = EncodePassive(max_encoded_bytes, encoded);
info = EncodePassive(frames_to_encode, max_encoded_bytes, encoded);
last_frame_active_ = false;
break;
}
case Vad::kActive: {
info = EncodeActive(max_encoded_bytes, encoded);
info = EncodeActive(frames_to_encode, max_encoded_bytes, encoded);
last_frame_active_ = true;
break;
}
Expand All @@ -176,20 +170,24 @@ AudioEncoder::EncodedInfo AudioEncoderCng::EncodeInternal(
}
}

speech_buffer_.clear();
frames_in_buffer_ = 0;
speech_buffer_.erase(
speech_buffer_.begin(),
speech_buffer_.begin() + frames_to_encode * samples_per_10ms_frame);
rtp_timestamps_.erase(rtp_timestamps_.begin(),
rtp_timestamps_.begin() + frames_to_encode);
return info;
}

AudioEncoder::EncodedInfo AudioEncoderCng::EncodePassive(
int frames_to_encode,
size_t max_encoded_bytes,
uint8_t* encoded) {
bool force_sid = last_frame_active_;
bool output_produced = false;
const size_t samples_per_10ms_frame = SamplesPer10msFrame();
CHECK_GE(max_encoded_bytes, frames_in_buffer_ * samples_per_10ms_frame);
CHECK_GE(max_encoded_bytes, frames_to_encode * samples_per_10ms_frame);
AudioEncoder::EncodedInfo info;
for (int i = 0; i < frames_in_buffer_; ++i) {
for (int i = 0; i < frames_to_encode; ++i) {
int16_t encoded_bytes_tmp = 0;
CHECK_GE(WebRtcCng_Encode(cng_inst_.get(),
&speech_buffer_[i * samples_per_10ms_frame],
Expand All @@ -202,23 +200,26 @@ AudioEncoder::EncodedInfo AudioEncoderCng::EncodePassive(
force_sid = false;
}
}
info.encoded_timestamp = first_timestamp_in_buffer_;
info.encoded_timestamp = rtp_timestamps_.front();
info.payload_type = cng_payload_type_;
info.send_even_if_empty = true;
info.speech = false;
return info;
}

AudioEncoder::EncodedInfo AudioEncoderCng::EncodeActive(
int frames_to_encode,
size_t max_encoded_bytes,
uint8_t* encoded) {
const size_t samples_per_10ms_frame = SamplesPer10msFrame();
AudioEncoder::EncodedInfo info;
for (int i = 0; i < frames_in_buffer_; ++i) {
for (int i = 0; i < frames_to_encode; ++i) {
info = speech_encoder_->Encode(
first_timestamp_in_buffer_, &speech_buffer_[i * samples_per_10ms_frame],
rtp_timestamps_.front(), &speech_buffer_[i * samples_per_10ms_frame],
samples_per_10ms_frame, max_encoded_bytes, encoded);
if (i < frames_in_buffer_ - 1) {
if (i == frames_to_encode - 1) {
CHECK_GT(info.encoded_bytes, 0u) << "Encoder didn't deliver data.";
} else {
CHECK_EQ(info.encoded_bytes, 0u) << "Encoder delivered data too early.";
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,21 @@ class AudioEncoderCngTest : public ::testing::Test {
timestamp_ += num_audio_samples_10ms_;
}

// Expect |num_calls| calls to the encoder, all successful. The last call
// claims to have encoded |kMockReturnEncodedBytes| bytes, and all the
// preceding ones return a default-constructed EncodedInfo (which the tests
// treat as 0 encoded bytes).
// NOTE(review): the final expectation is registered unconditionally, so a
// |num_calls| of 0 or less still expects exactly one call.
void ExpectEncodeCalls(int num_calls) {
// While |s| is in scope, the expectations below must be satisfied in the
// order they are declared.
InSequence s;
AudioEncoder::EncodedInfo info;
// The first |num_calls| - 1 calls return |info| unmodified.
for (int j = 0; j < num_calls - 1; ++j) {
EXPECT_CALL(mock_encoder_, EncodeInternal(_, _, _, _))
.WillOnce(Return(info));
}
// The final call is the one that reports encoded output.
info.encoded_bytes = kMockReturnEncodedBytes;
EXPECT_CALL(mock_encoder_, EncodeInternal(_, _, _, _))
.WillOnce(Return(info));
}

// Verifies that the cng_ object waits until it has collected
// |blocks_per_frame| blocks of audio, and then dispatches all of them to
// the underlying codec (speech or cng).
Expand All @@ -96,20 +111,8 @@ class AudioEncoderCngTest : public ::testing::Test {
Encode();
EXPECT_EQ(0u, encoded_info_.encoded_bytes);
}
if (active_speech) {
// Now expect |blocks_per_frame| calls to the encoder in sequence.
// Let the speech codec mock return true and set the number of encoded
// bytes to |kMockReturnEncodedBytes|.
InSequence s;
AudioEncoder::EncodedInfo info;
for (int j = 0; j < blocks_per_frame - 1; ++j) {
EXPECT_CALL(mock_encoder_, EncodeInternal(_, _, _, _))
.WillOnce(Return(info));
}
info.encoded_bytes = kMockReturnEncodedBytes;
EXPECT_CALL(mock_encoder_, EncodeInternal(_, _, _, _))
.WillOnce(Return(info));
}
if (active_speech)
ExpectEncodeCalls(blocks_per_frame);
Encode();
if (active_speech) {
EXPECT_EQ(kMockReturnEncodedBytes, encoded_info_.encoded_bytes);
Expand Down Expand Up @@ -283,23 +286,17 @@ TEST_F(AudioEncoderCngTest, MixedActivePassive) {
CreateCng();

// All of the frame is active speech.
EXPECT_CALL(mock_encoder_, EncodeInternal(_, _, _, _))
.Times(6)
.WillRepeatedly(Return(AudioEncoder::EncodedInfo()));
ExpectEncodeCalls(6);
EXPECT_TRUE(CheckMixedActivePassive(Vad::kActive, Vad::kActive));
EXPECT_TRUE(encoded_info_.speech);

// First half of the frame is active speech.
EXPECT_CALL(mock_encoder_, EncodeInternal(_, _, _, _))
.Times(6)
.WillRepeatedly(Return(AudioEncoder::EncodedInfo()));
ExpectEncodeCalls(6);
EXPECT_TRUE(CheckMixedActivePassive(Vad::kActive, Vad::kPassive));
EXPECT_TRUE(encoded_info_.speech);

// Second half of the frame is active speech.
EXPECT_CALL(mock_encoder_, EncodeInternal(_, _, _, _))
.Times(6)
.WillRepeatedly(Return(AudioEncoder::EncodedInfo()));
ExpectEncodeCalls(6);
EXPECT_TRUE(CheckMixedActivePassive(Vad::kPassive, Vad::kActive));
EXPECT_TRUE(encoded_info_.speech);

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -66,16 +66,19 @@ class AudioEncoderCng final : public AudioEncoder {
inline void operator()(CNG_enc_inst* ptr) const { WebRtcCng_FreeEnc(ptr); }
};

EncodedInfo EncodePassive(size_t max_encoded_bytes, uint8_t* encoded);
EncodedInfo EncodeActive(size_t max_encoded_bytes, uint8_t* encoded);
EncodedInfo EncodePassive(int frames_to_encode,
size_t max_encoded_bytes,
uint8_t* encoded);
EncodedInfo EncodeActive(int frames_to_encode,
size_t max_encoded_bytes,
uint8_t* encoded);
size_t SamplesPer10msFrame() const;

AudioEncoder* speech_encoder_;
const int cng_payload_type_;
const int num_cng_coefficients_;
std::vector<int16_t> speech_buffer_;
uint32_t first_timestamp_in_buffer_;
int frames_in_buffer_;
std::vector<uint32_t> rtp_timestamps_;
bool last_frame_active_;
rtc::scoped_ptr<Vad> vad_;
rtc::scoped_ptr<CNG_enc_inst, CngInstDeleter> cng_inst_;
Expand Down

0 comments on commit 367c868

Please sign in to comment.