sync : whisper.cpp (ARM 32-bit, abort callback, wav_writer, etc.) (ggerganov#602)
gartia · Nov 3, 2023 · 239defe · 239defe
1 parent ff90433
commit 239defe
Show file tree

Hide file tree

Showing 9 changed files with 419 additions and 120 deletions.
diff --git a/examples/common.h b/examples/common.h
@@ -7,6 +7,8 @@
 #include <vector>
 #include <random>
 #include <thread>
+#include <ctime>
+#include <fstream>
 
 #define COMMON_SAMPLE_RATE 16000
 
@@ -142,6 +144,104 @@ bool read_wav(
         std::vector<std::vector<float>> & pcmf32s,
         bool stereo);
 
+// Write normalized float PCM data into a 16-bit PCM WAV (RIFF) audio file.
+//
+// The two size fields of the 44-byte header are patched after every write(),
+// so the file on disk stays consistent even if close() is never reached.
+// All multi-byte fields are emitted little-endian, as the WAV format requires,
+// independent of the host byte order.
+// No destructor is needed: std::ofstream closes the file when it is destroyed.
+class wav_writer {
+private:
+    std::ofstream file;
+    uint32_t dataSize = 0;       // payload bytes written after the header
+    std::string wav_filename;    // path passed to the most recent open attempt
+
+    // Emit the low `n_bytes` bytes of `value`, least significant byte first
+    void write_le(const uint32_t value, const int n_bytes) {
+        for (int i = 0; i < n_bytes; ++i) {
+            file.put(static_cast<char>((value >> (8 * i)) & 0xFF));
+        }
+    }
+
+    // Write the 44-byte header; both size fields start out as zero
+    // Returns false if the stream reports an error
+    bool write_header(const uint32_t sample_rate,
+                      const uint16_t bits_per_sample,
+                      const uint16_t channels) {
+
+        const uint32_t block_align = uint32_t(channels) * bits_per_sample / 8;
+
+        file.write("RIFF", 4);
+        write_le(0, 4);                          // Placeholder for file size
+        file.write("WAVE", 4);
+        file.write("fmt ", 4);
+        write_le(16, 4);                         // Size of the fmt sub-chunk
+        write_le(1, 2);                          // PCM format
+        write_le(channels, 2);
+        write_le(sample_rate, 4);
+        write_le(sample_rate * block_align, 4);  // Byte rate
+        write_le(block_align, 2);
+        write_le(bits_per_sample, 2);
+        file.write("data", 4);
+        write_le(0, 4);                          // Placeholder for data size
+
+        return file.good();
+    }
+
+    // Convert one sample to int16
+    // Input outside [-1, 1] is clamped and NaN maps to silence, because casting
+    // a float that does not fit the target integer type is undefined behaviour
+    static int16_t to_int16(const float sample) {
+        if (sample != sample) {
+            return 0;
+        }
+        if (sample > 1.0f) {
+            return 32767;
+        }
+        if (sample < -1.0f) {
+            return -32767;
+        }
+        return static_cast<int16_t>(sample * 32767);
+    }
+
+    // Append `length` samples, then refresh the size fields of the header
+    // Returns false if no file is open or the stream reports an error
+    bool write_audio(const float * data, size_t length) {
+        if (!file.is_open()) {
+            return false;
+        }
+
+        for (size_t i = 0; i < length; ++i) {
+            write_le(static_cast<uint16_t>(to_int16(data[i])), 2);
+            dataSize += sizeof(int16_t);
+        }
+
+        file.seekp(4, std::ios::beg);
+        write_le(36 + dataSize, 4);    // RIFF size: header minus its first 8 bytes, plus payload
+        file.seekp(40, std::ios::beg);
+        write_le(dataSize, 4);
+        file.seekp(0, std::ios::end);
+
+        return file.good();
+    }
+
+    // Close whatever is open and start `filename` from scratch
+    bool open_wav(const std::string & filename) {
+        if (file.is_open()) {
+            file.close();
+        }
+        file.open(filename, std::ios::binary);
+        wav_filename = filename;
+        dataSize = 0;
+        return file.is_open();
+    }
+
+public:
+    // Start writing `filename`
+    // Only 16-bit output is implemented, so any other `bits_per_sample` is
+    // refused instead of producing a header that disagrees with the payload
+    // Opening the file that is already open is a no-op: writing continues where
+    // it left off and no second header is emitted
+    bool open(const std::string & filename,
+              const    uint32_t   sample_rate,
+              const    uint16_t   bits_per_sample,
+              const    uint16_t   channels) {
+
+        if (bits_per_sample != 16) {
+            return false;
+        }
+
+        if (file.is_open() && filename == wav_filename) {
+            return true;
+        }
+
+        if (!open_wav(filename)) {
+            return false;
+        }
+
+        return write_header(sample_rate, bits_per_sample, channels);
+    }
+
+    // Close the file; returns false if the stream ended up in a failed state
+    bool close() {
+        if (file.is_open()) {
+            file.close();
+        }
+        return !file.fail();
+    }
+
+    // Append `length` samples; it is assumed that PCM data is normalized to a
+    // range from -1 to 1 (values outside that range are clamped)
+    bool write(const float * data, size_t length) {
+        return write_audio(data, length);
+    }
+};
+
+
 // Apply a high-pass frequency filter to PCM audio
 // Suppresses frequencies below cutoff Hz
 void high_pass_filter(

diff --git a/examples/whisper/main.cpp b/examples/whisper/main.cpp
@@ -83,6 +83,7 @@ struct whisper_params {
     bool output_wts      = false;
     bool output_csv      = false;
     bool output_jsn      = false;
+    bool output_jsn_full = false;
     bool output_lrc      = false;
     bool print_special   = false;
     bool print_colors    = false;
@@ -151,6 +152,7 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
         else if (arg == "-fp"   || arg == "--font-path")       { params.font_path       = argv[++i]; }
         else if (arg == "-ocsv" || arg == "--output-csv")      { params.output_csv      = true; }
         else if (arg == "-oj"   || arg == "--output-json")     { params.output_jsn      = true; }
+        else if (arg == "-ojf"  || arg == "--output-json-full"){ params.output_jsn_full = params.output_jsn = true; }
         else if (arg == "-of"   || arg == "--output-file")     { params.fname_out.emplace_back(argv[++i]); }
         else if (arg == "-ps"   || arg == "--print-special")   { params.print_special   = true; }
         else if (arg == "-pc"   || arg == "--print-colors")    { params.print_colors    = true; }
@@ -206,6 +208,7 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para
     fprintf(stderr, "  -fp,       --font-path         [%-7s] path to a monospace font for karaoke video\n",     params.font_path.c_str());
     fprintf(stderr, "  -ocsv,     --output-csv        [%-7s] output result in a CSV file\n",                    params.output_csv ? "true" : "false");
     fprintf(stderr, "  -oj,       --output-json       [%-7s] output result in a JSON file\n",                   params.output_jsn ? "true" : "false");
+    fprintf(stderr, "  -ojf,      --output-json-full  [%-7s] include more information in the JSON file\n",      params.output_jsn_full ? "true" : "false");
     fprintf(stderr, "  -of FNAME, --output-file FNAME [%-7s] output file path (without file extension)\n",      "");
     fprintf(stderr, "  -ps,       --print-special     [%-7s] print special tokens\n",                           params.print_special ? "true" : "false");
     fprintf(stderr, "  -pc,       --print-colors      [%-7s] print colors\n",                                   params.print_colors ? "true" : "false");
@@ -511,7 +514,12 @@ bool output_score(struct whisper_context * ctx, const char * fname, const whispe
     return true;
 }
 
-bool output_json(struct whisper_context * ctx, const char * fname, const whisper_params & params, std::vector<std::vector<float>> pcmf32s) {
+bool output_json(
+             struct whisper_context * ctx,
+                         const char * fname,
+               const whisper_params & params,
+    std::vector<std::vector<float>>   pcmf32s,
+                               bool   full) {
     std::ofstream fout(fname);
     int indent = 0;
 
@@ -528,7 +536,7 @@ bool output_json(struct whisper_context * ctx, const char * fname, const whisper
     auto end_arr = [&](bool end) {
         indent--;
         doindent();
-        fout << (end ? "]\n" : "},\n");
+        fout << (end ? "]\n" : "],\n");
     };
 
     auto start_obj = [&](const char *name) {
@@ -569,12 +577,29 @@ bool output_json(struct whisper_context * ctx, const char * fname, const whisper
         end_value(end);
     };
 
+    // Emit a floating-point member: `val` is streamed into fout with the
+    // stream's current float formatting, between start_value(name) and
+    // end_value(end), following the same pattern as value_b below.
+    auto value_f = [&](const char *name, const float val, bool end) {
+        start_value(name);
+        fout << val;
+        end_value(end);
+    };
+
     auto value_b = [&](const char *name, const bool val, bool end) {
         start_value(name);
         fout << (val ? "true" : "false");
         end_value(end);
     };
 
+    // Emit the timing members shared by segments and tokens: a "timestamps"
+    // object holding the formatted from/to strings, followed by an "offsets"
+    // object holding t0 * 10 and t1 * 10. `end` is forwarded to the closing of
+    // "offsets"; "timestamps" is always closed with end_obj(false).
+    auto times_o = [&](int64_t t0, int64_t t1, bool end) {
+        const auto from_str = to_timestamp(t0, true);
+        const auto to_str   = to_timestamp(t1, true);
+
+        start_obj("timestamps");
+        value_s("from", from_str.c_str(), false);
+        value_s("to",   to_str.c_str(),   true);
+        end_obj(false);
+
+        start_obj("offsets");
+        value_i("from", 10 * t0, false);
+        value_i("to",   10 * t1, true);
+        end_obj(end);
+    };
+
     if (!fout.is_open()) {
         fprintf(stderr, "%s: failed to open '%s' for writing\n", __func__, fname);
         return false;
@@ -620,15 +645,26 @@ bool output_json(struct whisper_context * ctx, const char * fname, const whisper
                 const int64_t t1 = whisper_full_get_segment_t1(ctx, i);
 
                 start_obj(nullptr);
-                    start_obj("timestamps");
-                        value_s("from", to_timestamp(t0, true).c_str(), false);
-                        value_s("to", to_timestamp(t1, true).c_str(), true);
-                    end_obj(false);
-                    start_obj("offsets");
-                        value_i("from", t0 * 10, false);
-                        value_i("to", t1 * 10, true);
-                    end_obj(false);
-                    value_s("text", text, !params.diarize && !params.tinydiarize);
+                    times_o(t0, t1, false);
+                    value_s("text", text, !params.diarize && !params.tinydiarize && !full);
+
+                    if (full) {
+                        // full output: add a "tokens" array with one object per token of segment i
+                        start_arr("tokens");
+                        const int n = whisper_full_n_tokens(ctx, i);
+                        for (int j = 0; j < n; ++j) {
+                            auto token = whisper_full_get_token_data(ctx, i, j);
+                            start_obj(nullptr);
+                                value_s("text", whisper_token_to_str(ctx, token.id), false);
+                                if(token.t0 > -1 && token.t1 > -1) {
+                                    // If we have per-token timestamps, write them out
+                                    times_o(token.t0, token.t1, false);
+                                }
+                                value_i("id", token.id, false);
+                                value_f("p", token.p, true);
+                            // end_obj gets `true` only for the last token of the segment
+                            // (presumably suppressing the trailing comma, as end_arr does)
+                            end_obj(j == (n - 1));
+                        }
+                        // the array is closed as the final member unless one of the diarization flags is set
+                        end_arr(!params.diarize && !params.tinydiarize);
+                    }
 
                     if (params.diarize && pcmf32s.size() == 2) {
                         value_s("speaker", estimate_diarization_speaker(pcmf32s, t0, t1, true).c_str(), true);
@@ -912,7 +948,7 @@ int main(int argc, char ** argv) {
             wparams.offset_ms        = params.offset_t_ms;
             wparams.duration_ms      = params.duration_ms;
 
-            wparams.token_timestamps = params.output_wts || params.max_len > 0;
+            wparams.token_timestamps = params.output_wts || params.output_jsn_full || params.max_len > 0;
             wparams.thold_pt         = params.word_thold;
             wparams.max_len          = params.output_wts && params.max_len == 0 ? 60 : params.max_len;
             wparams.split_on_word    = params.split_on_word;
@@ -944,8 +980,9 @@ int main(int argc, char ** argv) {
                 wparams.progress_callback_user_data = &user_data;
             }
 
-            // example for abort mechanism
-            // in this example, we do not abort the processing, but we could if the flag is set to true
+            // examples of the abort mechanism
+            // in the examples below, we do not abort the processing, but we could if the flag is set to true
+
             // the callback is called before every encoder run - if it returns false, the processing is aborted
             {
                 static bool is_aborted = false; // NOTE: this should be atomic to avoid data race
@@ -957,6 +994,17 @@ int main(int argc, char ** argv) {
                 wparams.encoder_begin_callback_user_data = &is_aborted;
             }
 
+            // abort hook: invoked before every computation - returning true aborts the computation
+            {
+                static bool is_aborted = false; // NOTE: this should be atomic to avoid data race
+
+                // the flag travels to the callback through the user-data pointer
+                wparams.abort_callback_user_data = &is_aborted;
+                wparams.abort_callback = [](void * user_data) {
+                    return *static_cast<const bool *>(user_data);
+                };
+            }
+
             if (whisper_full_parallel(ctx, wparams, pcmf32.data(), pcmf32.size(), params.n_processors) != 0) {
                 fprintf(stderr, "%s: failed to process audio\n", argv[0]);
                 return 10;
@@ -1000,7 +1048,7 @@ int main(int argc, char ** argv) {
             // output to JSON file
             if (params.output_jsn) {
                 const auto fname_jsn = fname_out + ".json";
-                output_json(ctx, fname_jsn.c_str(), params, pcmf32s);
+                output_json(ctx, fname_jsn.c_str(), params, pcmf32s, params.output_jsn_full);
             }
 
             // output to LRC file