Skip to content

Commit

Permalink
Fixed channels issue on GetAvgSamples
Browse files Browse the repository at this point in the history
  • Loading branch information
sandrohanea committed Jun 10, 2023
1 parent a6ef626 commit 388cd32
Show file tree
Hide file tree
Showing 6 changed files with 185 additions and 3 deletions.
3 changes: 3 additions & 0 deletions .vscode/settings.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
{
"dotnet.defaultSolution": "Whisper.net.sln"
}
2 changes: 1 addition & 1 deletion Whisper.net/Wave/WaveParser.cs
Original file line number Diff line number Diff line change
Expand Up @@ -145,7 +145,7 @@ public float[] GetAvgSamples()
sampleSum += reader.ReadInt16();
}

samples[i] = (sampleSum / 4) / 32768.0f;
samples[i] = (sampleSum / 32768.0f) / channels;
}

return samples;
Expand Down
23 changes: 23 additions & 0 deletions examples/ContinuousRecognition/ContinuousRecognition.csproj
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
<!-- Project file for the ContinuousRecognition console example. -->
<Project Sdk="Microsoft.NET.Sdk">

<!-- Build a net6.0 executable with implicit usings and nullable reference types enabled. -->
<PropertyGroup>
<OutputType>Exe</OutputType>
<TargetFramework>net6.0</TargetFramework>
<ImplicitUsings>enable</ImplicitUsings>
<Nullable>enable</Nullable>
</PropertyGroup>

<!-- NOTE(review): presumably supplies the native Whisper runtime binaries; confirm. Version is pinned to 1.4.3. -->
<ItemGroup>
<PackageReference Include="Whisper.net.Runtime" Version="1.4.3" />
</ItemGroup>

<!-- Copy the shared sample recording to the build output (only when the source file is newer), so the example can open "bush.wav" by a relative file name. -->
<ItemGroup>
<None Include="../TestData/bush.wav">
<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
</None>
</ItemGroup>

<!-- Reference the Whisper.net library as a project in this repository rather than as a package. -->
<ItemGroup>
<ProjectReference Include="..\..\Whisper.net\Whisper.net.csproj" />
</ItemGroup>
</Project>
144 changes: 144 additions & 0 deletions examples/ContinuousRecognition/Program.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,144 @@
// Licensed under the MIT license: https://opensource.org/licenses/MIT

using System;
using Whisper.net;
using Whisper.net.Ggml;
using Whisper.net.Wave;

// Model to load and the audio file to transcribe.
var ggmlType = GgmlType.TinyEn;
var modelFileName = "ggml-tinyen.bin";
var wavFileName = "bush.wav";

// Window timings, in milliseconds:
//  - maxProcessingTimeMs sizes the sample/byte buffers and is the trim threshold of the window,
//  - minProcessingTimeMs is the amount of audio read for the very first pass,
//  - advancingProcessingTimeMs is the amount of audio added on every later pass.
var maxProcessingTimeMs = 10000;
var minProcessingTimeMs = 1500;
var advancingProcessingTimeMs = 500;

// Download the model only when it is not already on disk.
if (!File.Exists(modelFileName))
{
    await DownloadModel(modelFileName, ggmlType);
}

using var whisperFactory = WhisperFactory.FromPath(modelFileName);

var builder = whisperFactory.CreateBuilder()
    .WithProbabilities()
    .WithLanguage("en");

using var fileStream = File.OpenRead(wavFileName);
var waveParser = new WaveParser(fileStream);
await waveParser.InitializeAsync();

// Mono float samples of the current processing window (at most maxProcessingTimeMs of audio).
var samples = new float[waveParser.SampleRate / 1000 * maxProcessingTimeMs];

// Move the stream to the position the parser reports for the data chunk.
fileStream.Seek(waveParser.DataChunkPosition, SeekOrigin.Begin);

// One entry per recognition pass: the segments found and the time range that pass covered.
var partialResults = new List<(List<SegmentData> segments, TimeSpan startTime, TimeSpan endTime)>();

// Raw byte buffer: 2 bytes per sample per channel, sized for the maximum window.
var buffer = new byte[waveParser.SampleRate / 1000 * maxProcessingTimeMs * 2 * waveParser.Channels];

// First read: only the minimum processing time worth of audio.
var bufferSize = waveParser.SampleRate / 1000 * minProcessingTimeMs * 2 * waveParser.Channels;
var bytesRead = await fileStream.ReadAsync(buffer.AsMemory(0, (int)bufferSize));

var currentSampleIndex = 0;

// Average the 16-bit values of all channels in each frame and scale the result by 1/32768.
for (var byteOffset = 0; byteOffset < bytesRead;)
{
    long frameSum = 0;

    for (var channel = 0; channel < waveParser.Channels; channel++)
    {
        frameSum += BitConverter.ToInt16(buffer, byteOffset);
        byteOffset += 2;
    }

    var monoSample = frameSum / (float)waveParser.Channels / 32768.0f;
    samples[currentSampleIndex] = monoSample;
    currentSampleIndex++;
}

// Time range covered by the samples currently held in the window.
var currentProcessedStartTime = TimeSpan.Zero;
var currentProcessedEndTime = TimeSpan.FromMilliseconds(minProcessingTimeMs);

// First recognition pass over the initial window; its segments are recorded but not printed.
await using (var processor = builder.Build())
{
    var initialSegments = new List<SegmentData>();
    await foreach (var segment in processor.ProcessAsync(samples.AsMemory(0, currentSampleIndex)))
    {
        initialSegments.Add(segment);
    }

    partialResults.Add((initialSegments, currentProcessedStartTime, currentProcessedEndTime));
}

// NOTE(review): presumably meant to accumulate the final transcript; nothing in the visible code reads it.
var fullText = string.Empty;

// Number of mono samples consumed from the file so far. The window index (currentSampleIndex)
// shrinks every time the window is trimmed, so it cannot be used to detect the end of the audio.
long totalSamplesRead = currentSampleIndex;

// Size in bytes of one 16-bit frame (one sample for every channel).
var frameSizeInBytes = (int)(2 * waveParser.Channels);

// Half of the window capacity, in samples. The trim below must work in samples, not milliseconds.
var halfWindowSamples = samples.Length / 2;

// Grow the window by advancingProcessingTimeMs per pass and re-run recognition on the whole window.
while (totalSamplesRead < waveParser.SamplesCount)
{
    bufferSize = waveParser.SampleRate / 1000 * advancingProcessingTimeMs * 2 * waveParser.Channels;

    bytesRead = await fileStream.ReadAsync(buffer.AsMemory(0, (int)bufferSize));
    if (bytesRead == 0)
    {
        // End of stream: nothing new to process, so stop instead of re-processing the same window forever.
        break;
    }

    // Convert only complete frames; a trailing partial frame would otherwise be mixed with stale buffer bytes.
    for (var offset = 0; offset + frameSizeInBytes <= bytesRead;)
    {
        long sampleSum = 0;

        for (var currentChannel = 0; currentChannel < waveParser.Channels; currentChannel++)
        {
            sampleSum += BitConverter.ToInt16(buffer, offset);
            offset += 2;
        }

        samples[currentSampleIndex++] = sampleSum / (float)waveParser.Channels / 32768.0f;
        totalSamplesRead++;
    }

    currentProcessedEndTime = currentProcessedEndTime.Add(TimeSpan.FromMilliseconds(advancingProcessingTimeMs));

    await using (var processor = builder.Build())
    {
        var segments = new List<SegmentData>();
        await foreach (var data in processor.ProcessAsync(samples.AsMemory(0, currentSampleIndex)))
        {
            segments.Add(data);
        }
        partialResults.Add((segments, currentProcessedStartTime, currentProcessedEndTime));

        var indexSegment = 0;
        foreach (var segment in segments)
        {
            Console.WriteLine($"{indexSegment}: {segment.Start}->{segment.End}: {segment.Text} => with probability: {segment.Probability}");
            indexSegment++;
        }
    }

    // TODO: Check whether the partial results agree on a finished segment and return it.
    // If one segment is identified (e.g. "My fellow Americans" from second 0 to second 3), remove that part
    // from the samples, add its text to the prompt and continue processing the rest of the samples.

    // When the window has reached the maximum processing time, drop its first half and keep going with the rest.
    if (currentProcessedEndTime.TotalMilliseconds - currentProcessedStartTime.TotalMilliseconds >= maxProcessingTimeMs)
    {
        var samplesToDrop = Math.Min(halfWindowSamples, currentSampleIndex);
        var samplesToKeep = currentSampleIndex - samplesToDrop;

        // Array.Copy handles the overlapping source and destination ranges correctly.
        Array.Copy(samples, samplesToDrop, samples, 0, samplesToKeep);

        currentProcessedStartTime = currentProcessedStartTime.Add(TimeSpan.FromMilliseconds(maxProcessingTimeMs / 2));
        currentSampleIndex = samplesToKeep;
    }
}

/// <summary>
/// Fetches the requested GGML model with <c>WhisperGgmlDownloader</c> and writes it to <paramref name="fileName"/>.
/// </summary>
/// <param name="fileName">Path of the file the model bytes are written to.</param>
/// <param name="ggmlType">Model variant passed to the downloader.</param>
static async Task DownloadModel(string fileName, GgmlType ggmlType)
{
    Console.WriteLine($"Downloading Model {fileName}");

    using (var source = await WhisperGgmlDownloader.GetGgmlModelAsync(ggmlType))
    {
        using (var destination = File.OpenWrite(fileName))
        {
            await source.CopyToAsync(destination);
        }
    }
}
Binary file added examples/TestData/bush.wav
Binary file not shown.
16 changes: 14 additions & 2 deletions examples/Whisper.net.Examples.sln
Original file line number Diff line number Diff line change
Expand Up @@ -9,12 +9,16 @@ Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "SimpleSync", "SimpleSync\Si
EndProject
Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Diarization", "Diarization\Diarization.csproj", "{0B8C62B9-9607-4004-8ECD-6DEA70598EC9}"
EndProject
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "NAudioMp3", "NAudioMp3\NAudioMp3.csproj", "{2A95B4A4-A93D-40F4-A5F2-139C5D7260F0}"
Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "NAudioMp3", "NAudioMp3\NAudioMp3.csproj", "{2A95B4A4-A93D-40F4-A5F2-139C5D7260F0}"
EndProject
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "NAudioResampleWav", "NAudioResampleWav\NAudioResampleWav.csproj", "{73DB5111-7DC7-4396-8993-0BB7EBB793CE}"
Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "NAudioResampleWav", "NAudioResampleWav\NAudioResampleWav.csproj", "{73DB5111-7DC7-4396-8993-0BB7EBB793CE}"
EndProject
Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "CoreML", "CoreML\CoreML.csproj", "{9955466B-E5CA-4971-B52F-D0EBD8D856AC}"
EndProject
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "ContinuousRecognition", "ContinuousRecognition\ContinuousRecognition.csproj", "{95FE1AC5-F46C-4E43-A1F5-290F93CB9B6B}"
EndProject
Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Whisper.net", "..\Whisper.net\Whisper.net.csproj", "{A5A05A97-D880-4F87-BCE3-1A5420846F4D}"
EndProject
Global
GlobalSection(SolutionConfigurationPlatforms) = preSolution
Debug|Any CPU = Debug|Any CPU
Expand Down Expand Up @@ -45,6 +49,14 @@ Global
{9955466B-E5CA-4971-B52F-D0EBD8D856AC}.Debug|Any CPU.Build.0 = Debug|Any CPU
{9955466B-E5CA-4971-B52F-D0EBD8D856AC}.Release|Any CPU.ActiveCfg = Release|Any CPU
{9955466B-E5CA-4971-B52F-D0EBD8D856AC}.Release|Any CPU.Build.0 = Release|Any CPU
{95FE1AC5-F46C-4E43-A1F5-290F93CB9B6B}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
{95FE1AC5-F46C-4E43-A1F5-290F93CB9B6B}.Debug|Any CPU.Build.0 = Debug|Any CPU
{95FE1AC5-F46C-4E43-A1F5-290F93CB9B6B}.Release|Any CPU.ActiveCfg = Release|Any CPU
{95FE1AC5-F46C-4E43-A1F5-290F93CB9B6B}.Release|Any CPU.Build.0 = Release|Any CPU
{A5A05A97-D880-4F87-BCE3-1A5420846F4D}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
{A5A05A97-D880-4F87-BCE3-1A5420846F4D}.Debug|Any CPU.Build.0 = Debug|Any CPU
{A5A05A97-D880-4F87-BCE3-1A5420846F4D}.Release|Any CPU.ActiveCfg = Release|Any CPU
{A5A05A97-D880-4F87-BCE3-1A5420846F4D}.Release|Any CPU.Build.0 = Release|Any CPU
EndGlobalSection
GlobalSection(SolutionProperties) = preSolution
HideSolutionNode = FALSE
Expand Down

0 comments on commit 388cd32

Please sign in to comment.