infill/bytes endpoint (#67)

Co-authored-by: Noah Tye <hi@noahtye.com>
cartesia-ai · Jan 22, 2025 · 7c22e7e · 7c22e7e
1 parent 0216c79
commit 7c22e7e
Show file tree

Hide file tree

Showing 2 changed files with 85 additions and 1 deletion.
diff --git a/fern/definition/infill.yml b/fern/definition/infill.yml
@@ -0,0 +1,84 @@
+imports:
+  tts: ./tts.yml
+  voice_changer: ./voice-changer.yml
+
+service:
+  base-path: /infill  # path prefix for the endpoints declared in this service
+  auth: true  # auth is enabled for this service
+  endpoints:
+    bytes:  # base-path + path below give the route POST /infill/bytes
+      path: /bytes
+      method: POST
+      display-name: Infill (Bytes)
+      docs: |
+        Generate audio that smoothly connects two existing audio segments. This is useful for inserting new speech between existing speech segments while maintaining natural transitions.
+
+        At least one of `left_audio` or `right_audio` must be provided.
+      request:
+        name: InfillBytesRequest
+        body:  # mixes file-typed properties (left_audio, right_audio) with plain scalar fields
+          properties:
+            left_audio:  # optional on its own: the endpoint docs require at least one of left_audio / right_audio
+              type: optional<file>
+            right_audio:  # optional on its own: the endpoint docs require at least one of left_audio / right_audio
+              type: optional<file>
+            model_id[]:  # NOTE(review): despite the `[]` suffix this is typed as a single string and the examples pass one value — confirm the key name is intentional
+              type: string
+              docs: The ID of the model to use for generating audio
+            language[]:  # same `[]`-suffixed key with a single string type as model_id[]
+              type: string
+              docs: The language of the transcript
+            transcript[]:  # same `[]`-suffixed key with a single string type as model_id[]
+              type: string
+              docs: The infill text to generate
+            voice[id]:  # bracketed sub-key of `voice`; sibling voice[...] keys carry the experimental controls
+              type: string
+              docs: The ID of the voice to use for generating audio
+            output_format[container]:  # the examples in this file use the values `mp3` and `wav`
+              type: voice_changer.OutputFormatContainer  # resolved through the ./voice-changer.yml import
+              docs: The format of the output audio
+            output_format[sample_rate]:
+              type: integer
+              docs: The sample rate of the output audio
+            output_format[encoding]:  # optional in the schema because it only applies to `raw` and `wav` containers
+              type: optional<tts.RawEncoding>
+              docs: |
+                Required for `raw` and `wav` containers.
+            output_format[bit_rate]:  # optional in the schema because it only applies to `mp3` containers
+              type: optional<integer>
+              docs: |
+                Required for `mp3` containers.
+            voice[__experimental_controls][speed]:
+              type: optional<tts.Speed>  # tts.* types are resolved through the ./tts.yml import
+              docs: |
+                Either a number between -1.0 and 1.0 or a natural language description of speed.
+
+                If you specify a number, 0.0 is the default speed, -1.0 is the slowest speed, and 1.0 is the fastest speed.
+            voice[__experimental_controls][emotion][]:  # NOTE(review): docs describe an array, but the type is a single optional tts.Emotion — confirm whether a list type is intended
+              type: optional<tts.Emotion>
+              docs: |
+                An array of emotion:level tags.
+
+                Supported emotions are: anger, positivity, surprise, sadness, and curiosity.
+
+                Supported levels are: lowest, low, (omit), high, highest.
+      response: file  # the response is declared as a file
+      examples:  # NOTE(review): neither example supplies left_audio or right_audio, although the endpoint docs require at least one — confirm file fields may be omitted here
+        - name: MP3  # mp3 container, so output_format[bit_rate] is supplied
+          request:
+            model_id[]: sonic-english
+            language[]: en
+            transcript[]: middle segment
+            voice[id]: 694f9389-aac1-45b6-b726-9d9369183238
+            output_format[container]: mp3
+            output_format[sample_rate]: 44100
+            output_format[bit_rate]: 128000
+        - name: WAV  # wav container, so output_format[encoding] is supplied
+          request:
+            model_id[]: sonic-english
+            language[]: en
+            transcript[]: middle segment
+            voice[id]: 694f9389-aac1-45b6-b726-9d9369183238
+            output_format[container]: wav
+            output_format[sample_rate]: 44100
+            output_format[encoding]: pcm_f32le
diff --git a/fern/generators.yml b/fern/generators.yml
@@ -40,6 +40,6 @@ groups:
             emittery: "^0.13.1"
             human-id: "^4.1.1"
             ws: "^8.15.13"
-          extraDevDependencies: 
+          extraDevDependencies:
             "@types/ws": "^8.5.13"
         smart-casing: true