We've built text chat, RAG systems, agents, and image generation. Time to add ears and a voice to our Rails app. Today we integrate OpenAI's Whisper for speech-to-text and their TTS API for text-to-speech — with real-time playback via Turbo.
Setup
# Gemfile
gem "ruby-openai"
bundle install
rails g model Transcription user:references audio:attachment text:text status:integer language:string duration:float
rails g model SpeechSynthesis user:references input_text:text voice:string audio:attachment status:integer
rails db:migrate
Models:
# app/models/transcription.rb
class Transcription < ApplicationRecord
# One recorded audio clip and the Whisper-generated transcript for it.
# Populated asynchronously by TranscribeAudioJob.
belongs_to :user
# The uploaded/browser-recorded audio file (webm from the Stimulus recorder).
has_one_attached :audio
# Lifecycle driven by TranscribeAudioJob: pending -> processing -> completed/failed.
enum :status, { pending: 0, processing: 1, completed: 2, failed: 3 }
# NOTE(review): presence validation on an Active Storage attachment relies on
# Attached::One#blank? delegating to attached? — works on Rails 6.1+; confirm
# the app's Rails version supports this.
validates :audio, presence: true
end
# app/models/speech_synthesis.rb
class SpeechSynthesis < ApplicationRecord
# One text-to-speech request: input text in, generated MP3 attached out.
# Populated asynchronously by SynthesizeSpeechJob.
belongs_to :user
# The generated MP3, attached by SynthesizeSpeechJob on completion.
has_one_attached :audio
# Lifecycle driven by SynthesizeSpeechJob: pending -> processing -> completed/failed.
enum :status, { pending: 0, processing: 1, completed: 2, failed: 3 }
# Voice names accepted by the OpenAI TTS API.
VOICES = %w[alloy echo fable onyx nova shimmer].freeze
validates :voice, inclusion: { in: VOICES }
# 4096 characters is the OpenAI TTS per-request input limit.
validates :input_text, presence: true, length: { maximum: 4096 }
end
Whisper Transcription Service
# app/services/whisper_service.rb
class WhisperService
  # Thin wrapper around OpenAI's Whisper speech-to-text endpoint.
  def initialize
    @client = OpenAI::Client.new(access_token: ENV["OPENAI_API_KEY"])
  end

  # Transcribes the audio file at +audio_path+.
  #
  # audio_path - filesystem path to an audio file Whisper accepts (webm, mp3, ...)
  # language:  - optional ISO-639-1 hint (e.g. "en"); omitted when nil so
  #              Whisper auto-detects the language.
  #
  # Returns a Hash with :text, :language, :duration, and :segments
  # (each segment: { start:, end:, text: }); :segments is nil when the
  # response contains none.
  def transcribe(audio_path, language: nil)
    # Block form guarantees the file handle is closed even when the API call
    # raises — the original leaked the File object opened inline.
    File.open(audio_path, "rb") do |file|
      params = {
        model: "whisper-1",
        file: file,
        # verbose_json adds detected language, duration, and timestamped segments
        response_format: "verbose_json"
      }
      params[:language] = language if language

      response = @client.audio.transcribe(parameters: params)
      {
        text: response["text"],
        language: response["language"],
        duration: response["duration"],
        segments: response["segments"]&.map do |seg|
          { start: seg["start"], end: seg["end"], text: seg["text"] }
        end
      }
    end
  end
end
Text-to-Speech Service
# app/services/tts_service.rb
class TtsService
  # Converts text into spoken audio via OpenAI's text-to-speech endpoint.
  def initialize
    @client = OpenAI::Client.new(access_token: ENV["OPENAI_API_KEY"])
  end

  # Returns the raw MP3 bytes for +text+ spoken by +voice+.
  #
  # voice: - one of OpenAI's preset voice names (default "nova")
  # model: - "tts-1" (fast) or another TTS-capable model
  # speed: - playback-rate multiplier passed through to the API
  def synthesize(text, voice: "nova", model: "tts-1", speed: 1.0)
    request = {
      model: model,
      input: text,
      voice: voice,
      speed: speed,
      response_format: "mp3"
    }
    # The speech endpoint responds with the audio body itself, not JSON.
    @client.audio.speech(parameters: request)
  end
end
Background Jobs
Both APIs can take a few seconds. Active Job keeps things responsive:
# app/jobs/transcribe_audio_job.rb
class TranscribeAudioJob < ApplicationJob
  queue_as :ai_tasks
  retry_on Faraday::Error, wait: 5.seconds, attempts: 3

  # Downloads the attached audio, sends it through WhisperService, stores the
  # transcript, and broadcasts the updated card over the user's Turbo Stream.
  def perform(transcription_id)
    transcription = Transcription.find(transcription_id)
    transcription.processing!

    # Whisper needs a real file on disk, so spill the blob to a temp file.
    tmpfile = Tempfile.new(["audio", ".webm"])
    tmpfile.binmode
    tmpfile.write(transcription.audio.download)
    tmpfile.rewind

    result = WhisperService.new.transcribe(tmpfile.path)
    transcription.update!(
      text: result[:text],
      language: result[:language],
      duration: result[:duration],
      status: :completed
    )
    broadcast_update(transcription)
  rescue Faraday::Error
    # Re-raise so retry_on can schedule another attempt. The original's
    # blanket `rescue StandardError` caught these first and silently
    # defeated the retry_on declaration above.
    raise
  rescue StandardError => e
    transcription&.failed!
    Rails.logger.error("Transcription failed: #{e.message}")
    # transcription is nil when find itself raised — nothing to broadcast,
    # and calling broadcast_update(nil) would crash inside the rescue.
    broadcast_update(transcription) if transcription
  ensure
    tmpfile&.close
    tmpfile&.unlink
  end

  private

  # Replaces the transcription card in place for the owning user.
  def broadcast_update(transcription)
    Turbo::StreamsChannel.broadcast_replace_to(
      "user_#{transcription.user_id}_transcriptions",
      target: "transcription_#{transcription.id}",
      partial: "transcriptions/transcription",
      locals: { transcription: transcription }
    )
  end
end
# app/jobs/synthesize_speech_job.rb
class SynthesizeSpeechJob < ApplicationJob
  queue_as :ai_tasks
  retry_on Faraday::Error, wait: 5.seconds, attempts: 3

  # Generates TTS audio for a SpeechSynthesis record, attaches the MP3, and
  # broadcasts the updated card over the user's Turbo Stream.
  def perform(speech_id)
    speech = SpeechSynthesis.find(speech_id)
    speech.processing!

    audio_data = TtsService.new.synthesize(
      speech.input_text,
      voice: speech.voice
    )

    speech.audio.attach(
      io: StringIO.new(audio_data),
      filename: "speech_#{speech.id}.mp3",
      content_type: "audio/mpeg"
    )
    speech.completed!
    broadcast_update(speech)
  rescue Faraday::Error
    # Re-raise so retry_on can schedule another attempt; the original's
    # blanket `rescue StandardError` swallowed transient API errors and
    # defeated the retry_on declaration above.
    raise
  rescue StandardError => e
    speech&.failed!
    Rails.logger.error("Speech synthesis failed: #{e.message}")
    # speech is nil when find itself raised — nothing to broadcast then.
    broadcast_update(speech) if speech
  end

  private

  # Replaces the speech card in place for the owning user.
  def broadcast_update(speech)
    Turbo::StreamsChannel.broadcast_replace_to(
      "user_#{speech.user_id}_speeches",
      target: "speech_#{speech.id}",
      partial: "speech_syntheses/speech",
      locals: { speech: speech }
    )
  end
end
Controllers
# app/controllers/transcriptions_controller.rb
class TranscriptionsController < ApplicationController
  before_action :authenticate_user!

  # Lists the current user's transcriptions, newest first.
  def index
    @transcriptions = current_user.transcriptions.order(created_at: :desc)
  end

  # Accepts the browser-recorded audio blob (multipart "audio" param), saves
  # a pending Transcription, and enqueues the Whisper job.
  def create
    @transcription = current_user.transcriptions.build(status: :pending)
    # attach(nil) raises, so only attach when the param is present; the
    # model's presence validation then reports the missing file cleanly.
    @transcription.audio.attach(params[:audio]) if params[:audio].present?

    if @transcription.save
      TranscribeAudioJob.perform_later(@transcription.id)
      respond_to do |format|
        format.turbo_stream
        format.html { redirect_to transcriptions_path }
      end
    else
      # Re-rendering the index template requires the collection it iterates;
      # the original rendered :index without setting @transcriptions.
      @transcriptions = current_user.transcriptions.order(created_at: :desc)
      render :index, status: :unprocessable_entity
    end
  end
end
# app/controllers/speech_syntheses_controller.rb
class SpeechSynthesesController < ApplicationController
  before_action :authenticate_user!

  # Saves a pending SpeechSynthesis and enqueues the TTS job.
  def create
    @speech = current_user.speech_syntheses.build(speech_params)
    @speech.status = :pending

    if @speech.save
      SynthesizeSpeechJob.perform_later(@speech.id)
      respond_to do |format|
        format.turbo_stream
        format.html { redirect_to transcriptions_path }
      end
    else
      # This controller has no index action or template — the original's
      # `render :index` would raise a missing-template error. Send the user
      # back with the validation errors instead.
      redirect_to transcriptions_path,
                  alert: @speech.errors.full_messages.to_sentence
    end
  end

  private

  # Strong parameters: only the text to speak and the chosen voice.
  def speech_params
    params.require(:speech_synthesis).permit(:input_text, :voice)
  end
end
Recording Audio in the Browser
Use a Stimulus controller to capture microphone audio:
// app/javascript/controllers/audio_recorder_controller.js
import { Controller } from "@hotwired/stimulus"
// Records microphone audio with MediaRecorder and uploads it to the
// transcriptions endpoint as multipart form data.
export default class extends Controller {
  static targets = ["button", "status", "form"]

  async start() {
    let stream
    try {
      stream = await navigator.mediaDevices.getUserMedia({ audio: true })
    } catch {
      // Permission denied or no microphone available — the original let
      // this rejection escape unhandled.
      this.statusTarget.textContent = "Microphone access denied."
      return
    }

    this.recorder = new MediaRecorder(stream, { mimeType: "audio/webm" })
    this.chunks = []
    this.recorder.ondataavailable = (e) => this.chunks.push(e.data)
    this.recorder.onstop = () => this.submit(stream)
    this.recorder.start()

    this.buttonTarget.textContent = "Stop Recording"
    this.buttonTarget.dataset.action = "click->audio-recorder#stop"
    this.statusTarget.textContent = "Recording..."
  }

  stop() {
    this.recorder.stop()
  }

  async submit(stream) {
    // Release the mic so the browser's recording indicator turns off.
    stream.getTracks().forEach(track => track.stop())

    const blob = new Blob(this.chunks, { type: "audio/webm" })
    const formData = new FormData()
    formData.append("audio", blob, "recording.webm")

    this.statusTarget.textContent = "Uploading..."
    try {
      const response = await fetch(this.formTarget.action, {
        method: "POST",
        body: formData,
        headers: {
          "X-CSRF-Token": document.querySelector("[name=csrf-token]").content,
          // Request the turbo-stream variant so the controller's
          // format.turbo_stream response is actually produced.
          "Accept": "text/vnd.turbo-stream.html, text/html"
        }
      })
      // The original reported success before the request finished and
      // ignored network failures and error responses entirely.
      if (response.ok) {
        this.statusTarget.textContent = "Uploaded. Transcribing..."
        // Render the turbo-stream response (the new pending card) in place;
        // plain fetch does not process turbo streams automatically.
        const html = await response.text()
        if (window.Turbo) window.Turbo.renderStreamMessage(html)
      } else {
        this.statusTarget.textContent = "Upload failed."
      }
    } catch {
      this.statusTarget.textContent = "Upload failed."
    }

    this.buttonTarget.textContent = "Start Recording"
    this.buttonTarget.dataset.action = "click->audio-recorder#start"
  }
}
The View
<%# app/views/transcriptions/index.html.erb %>
<%# Subscribe to the per-user stream that TranscribeAudioJob broadcasts on;
    the name must match the job's "user_#{id}_transcriptions" exactly. %>
<%= turbo_stream_from "user_#{current_user.id}_transcriptions" %>
<%# Recorder UI — the form is never submitted normally; the Stimulus
    controller posts the recorded blob via fetch to the form's action. %>
<div data-controller="audio-recorder">
<%= form_with url: transcriptions_path, data: { audio_recorder_target: "form" } do %>
<button type="button"
data-audio-recorder-target="button"
data-action="click->audio-recorder#start">
Start Recording
</button>
<span data-audio-recorder-target="status"></span>
<% end %>
</div>
<%# Existing transcriptions; each card's DOM id is the broadcast target. %>
<div id="transcriptions">
<% @transcriptions.each do |t| %>
<%= render "transcription", transcription: t %>
<% end %>
</div>
The transcription partial:
<%# app/views/transcriptions/_transcription.html.erb %>
<%# One transcription card. The DOM id must match the broadcast_replace_to
    target ("transcription_#{id}") used by TranscribeAudioJob. %>
<div id="transcription_<%= transcription.id %>" class="transcription-card">
<%# enum status comes back as a string from the predicate-backed column. %>
<% case transcription.status %>
<% when "pending", "processing" %>
<div class="spinner">Transcribing audio...</div>
<% when "completed" %>
<p><%= transcription.text %></p>
<small><%= transcription.language %> · <%= transcription.duration&.round(1) %>s</small>
<% when "failed" %>
<div class="error">Transcription failed.</div>
<% end %>
</div>
The Full Picture
The pattern is the same one we've used throughout this series: user action → save a pending record → background job does the AI work → Turbo Stream broadcasts the result. Whether it's text generation, image creation, or audio processing, the architecture stays consistent.
Whisper handles audio up to 25MB. For longer recordings, split them into chunks before sending. The TTS API accepts up to 4096 characters per request — for longer text, split by paragraphs and concatenate the audio.
That wraps up Phase 4. We've built a complete AI toolkit in Rails: chat, embeddings, RAG, agents, streaming, image generation, and now voice. Next up is Phase 5 — making all of this production-ready, starting with testing AI features using RSpec.