This post walks through building a .NET MAUI application that uses OpenAI's technologies for voice-based interaction: speech recognition, prompt processing with GPT, and text-to-speech responses.
Project Setup
1. Create a new .NET MAUI project
dotnet new maui -n VoiceAIApp
cd VoiceAIApp
2. Add required NuGet packages
dotnet add package OpenAI
dotnet add package Microsoft.CognitiveServices.Speech
dotnet add package CommunityToolkit.Maui
dotnet add package CommunityToolkit.Maui.MediaElement
dotnet add package CommunityToolkit.Mvvm
Implementation
1. Configure App Services (MauiProgram.cs)
using CommunityToolkit.Maui;
using Microsoft.Extensions.Logging;

public static class MauiProgram
{
    public static MauiApp CreateMauiApp()
    {
        var builder = MauiApp.CreateBuilder();
        builder
            .UseMauiApp<App>()
            .UseMauiCommunityToolkit()
            .UseMauiCommunityToolkitMediaElement()
            .ConfigureFonts(fonts =>
            {
                fonts.AddFont("OpenSans-Regular.ttf", "OpenSansRegular");
                fonts.AddFont("OpenSans-Semibold.ttf", "OpenSansSemibold");
            });

#if DEBUG
        builder.Logging.AddDebug();
#endif

        // Register services and pages for dependency injection
        builder.Services.AddSingleton<IOpenAIService, OpenAIService>();
        builder.Services.AddSingleton<ISpeechService, SpeechService>();
        builder.Services.AddSingleton<MainViewModel>();
        builder.Services.AddSingleton<MainPage>();

        return builder.Build();
    }
}
2. OpenAI Service (OpenAIService.cs)
using OpenAI_API;
using OpenAI_API.Audio;
using OpenAI_API.Chat;

public interface IOpenAIService
{
    Task<string> ProcessPromptAsync(string prompt);
    Task<string> TranscribeAudioAsync(byte[] audioData);
    Task<Stream> GenerateSpeechAsync(string text);
}

public class OpenAIService : IOpenAIService
{
    private readonly OpenAIAPI _openAiApi;

    public OpenAIService()
    {
        // Initialize with your API key (store securely!)
        _openAiApi = new OpenAIAPI("your-openai-api-key");
    }

    public async Task<string> ProcessPromptAsync(string prompt)
    {
        var chatRequest = new ChatRequest
        {
            Messages = new List<ChatMessage>
            {
                new ChatMessage(ChatMessageRole.User, prompt)
            },
            Model = "gpt-4"
        };

        var response = await _openAiApi.Chat.CreateChatCompletionAsync(chatRequest);
        return response.Choices[0].Message.Content;
    }

    public async Task<string> TranscribeAudioAsync(byte[] audioData)
    {
        var transcriptionRequest = new TranscriptionRequest
        {
            AudioData = audioData,
            Model = "whisper-1"
        };

        var response = await _openAiApi.Transcriptions.GetTranscriptionAsync(transcriptionRequest);
        return response.Text;
    }

    public async Task<Stream> GenerateSpeechAsync(string text)
    {
        var ttsRequest = new TTSRequest
        {
            Input = text,
            Voice = "alloy", // or "echo", "fable", "onyx", "nova", "shimmer"
            Model = "tts-1",
            ResponseFormat = "mp3"
        };

        return await _openAiApi.TextToSpeech.GetSpeechAsStreamAsync(ttsRequest);
    }
}
3. Speech Service (SpeechService.cs)
using Microsoft.CognitiveServices.Speech;
using Microsoft.CognitiveServices.Speech.Audio;

public interface ISpeechService
{
    Task<string> ListenAsync(string locale, CancellationToken cancellationToken);
    Task SpeakAsync(string text, string locale, CancellationToken cancellationToken);
    Task<byte[]> CaptureAudioAsync(int durationSeconds, CancellationToken cancellationToken);
}

public class SpeechService : ISpeechService
{
    private readonly string _speechKey = "your-azure-speech-key";
    private readonly string _speechRegion = "your-region";

    public async Task<string> ListenAsync(string locale, CancellationToken cancellationToken)
    {
        var config = SpeechConfig.FromSubscription(_speechKey, _speechRegion);
        config.SpeechRecognitionLanguage = locale;

        using var audioConfig = AudioConfig.FromDefaultMicrophoneInput();
        using var recognizer = new SpeechRecognizer(config, audioConfig);

        var result = await recognizer.RecognizeOnceAsync();
        return result.Text;
    }

    public async Task SpeakAsync(string text, string locale, CancellationToken cancellationToken)
    {
        var config = SpeechConfig.FromSubscription(_speechKey, _speechRegion);
        config.SpeechSynthesisLanguage = locale;

        using var synthesizer = new SpeechSynthesizer(config);
        await synthesizer.SpeakTextAsync(text);
    }

    public async Task<byte[]> CaptureAudioAsync(int durationSeconds, CancellationToken cancellationToken)
    {
        // This is a simplified version - you'd need to implement proper audio capture
        var audioConfig = AudioConfig.FromDefaultMicrophoneInput();
        var audioStream = AudioInputStream.CreatePushStream(
            AudioStreamFormat.GetWaveFormatPCM(16000, 16, 1));

        // Capture audio for the specified duration
        // Implementation depends on your specific requirements
        return new byte[0]; // Return captured audio bytes
    }
}
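The CaptureAudioAsync stub above still needs a real recording path, and .NET MAUI has no built-in cross-platform microphone capture. One option is the Plugin.Maui.Audio NuGet package; the minimal sketch below assumes that package is installed and uses its IAudioRecorder API (it is not part of the original service).

// Sketch only: assumes the Plugin.Maui.Audio NuGet package is installed;
// API names follow that plugin, not the services defined in this post.
using Plugin.Maui.Audio;

public class PluginAudioCapture
{
    private readonly IAudioManager _audioManager = AudioManager.Current;

    public async Task<byte[]> CaptureAudioAsync(int durationSeconds, CancellationToken cancellationToken)
    {
        var recorder = _audioManager.CreateRecorder();
        await recorder.StartAsync();

        try
        {
            // Record for the requested duration (or until cancelled).
            await Task.Delay(TimeSpan.FromSeconds(durationSeconds), cancellationToken);
        }
        catch (OperationCanceledException)
        {
            // Stop early if the caller cancels; fall through to StopAsync.
        }

        var audioSource = await recorder.StopAsync();

        // Copy the recorded stream into a byte array for the Whisper upload.
        using var audioStream = audioSource.GetAudioStream();
        using var memoryStream = new MemoryStream();
        await audioStream.CopyToAsync(memoryStream);
        return memoryStream.ToArray();
    }
}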
4. ViewModel (MainViewModel.cs)
using CommunityToolkit.Mvvm.ComponentModel;
using CommunityToolkit.Mvvm.Input;

public partial class MainViewModel : ObservableObject
{
    private readonly IOpenAIService _openAiService;
    private readonly ISpeechService _speechService;
    private CancellationTokenSource _listeningCts;

    [ObservableProperty]
    private string _statusMessage;

    [ObservableProperty]
    private bool _isListening;

    public MainViewModel(IOpenAIService openAiService, ISpeechService speechService)
    {
        _openAiService = openAiService;
        _speechService = speechService;
        StatusMessage = "Ready to listen...";
    }

    [RelayCommand]
    private async Task StartListeningAsync(string locale = "en-US")
    {
        if (IsListening)
            return;

        IsListening = true;
        StatusMessage = "Listening...";
        _listeningCts = new CancellationTokenSource();

        try
        {
            // Option 1: Use Azure Speech-to-Text
            // var spokenText = await _speechService.ListenAsync(locale, _listeningCts.Token);

            // Option 2: Use Whisper for better accuracy
            var audioData = await _speechService.CaptureAudioAsync(5, _listeningCts.Token);
            var spokenText = await _openAiService.TranscribeAudioAsync(audioData);

            if (!string.IsNullOrWhiteSpace(spokenText))
            {
                StatusMessage = "Processing...";
                var response = await _openAiService.ProcessPromptAsync(spokenText);

                StatusMessage = "Speaking...";
                await _speechService.SpeakAsync(response, locale, _listeningCts.Token);
            }
        }
        catch (OperationCanceledException)
        {
            // Listening was stopped by the user.
        }
        catch (Exception ex)
        {
            StatusMessage = $"Error: {ex.Message}";
        }
        finally
        {
            IsListening = false;
            StatusMessage = "Ready to listen...";
        }
    }

    [RelayCommand]
    private void StopListening()
    {
        // Cancels the in-flight capture; bound to the Stop button in MainPage.xaml.
        _listeningCts?.Cancel();
    }
}
5. Main Page (MainPage.xaml)
<?xml version="1.0" encoding="utf-8" ?>
<ContentPage xmlns="http://schemas.microsoft.com/dotnet/2021/maui"
             xmlns:x="http://schemas.microsoft.com/winfx/2009/xaml"
             xmlns:local="clr-namespace:VoiceAIApp"
             x:Class="VoiceAIApp.MainPage"
             Title="Voice AI Assistant">

    <!-- The BindingContext is injected in the code-behind, so it is not set here. -->
    <ContentPage.Resources>
        <local:ListeningConverter x:Key="ListeningConverter" />
        <local:InverseBoolConverter x:Key="InverseBoolConverter" />
    </ContentPage.Resources>

    <ScrollView>
        <VerticalStackLayout Spacing="25" Padding="30,0" VerticalOptions="Center">

            <Label Text="Voice AI Assistant"
                   SemanticProperties.HeadingLevel="Level1"
                   FontSize="32"
                   HorizontalOptions="Center" />

            <Label Text="Speak and get AI responses"
                   SemanticProperties.HeadingLevel="Level2"
                   FontSize="18"
                   HorizontalOptions="Center" />

            <Label Text="{Binding StatusMessage}"
                   FontSize="16"
                   HorizontalOptions="Center"
                   TextColor="{AppThemeBinding Light={StaticResource Primary}, Dark={StaticResource Secondary}}" />

            <Button Text="{Binding IsListening, Converter={StaticResource ListeningConverter}}"
                    Command="{Binding StartListeningCommand}"
                    CommandParameter="en-US"
                    HorizontalOptions="Center"
                    IsEnabled="{Binding IsListening, Converter={StaticResource InverseBoolConverter}}"
                    SemanticProperties.Hint="Tap to start listening" />

            <Button Text="Stop Listening"
                    Command="{Binding StopListeningCommand}"
                    HorizontalOptions="Center"
                    IsVisible="{Binding IsListening}"
                    SemanticProperties.Hint="Tap to stop listening" />

            <Picker x:Name="LanguagePicker"
                    Title="Select Language"
                    HorizontalOptions="Center"
                    WidthRequest="200">
                <Picker.Items>
                    <x:String>en-US</x:String>
                    <x:String>es-ES</x:String>
                    <x:String>fr-FR</x:String>
                    <x:String>de-DE</x:String>
                    <x:String>it-IT</x:String>
                    <x:String>ja-JP</x:String>
                </Picker.Items>
            </Picker>

        </VerticalStackLayout>
    </ScrollView>

</ContentPage>
6. Main Page Code-behind (MainPage.xaml.cs)
public partial class MainPage : ContentPage
{
    public MainPage(MainViewModel viewModel)
    {
        InitializeComponent();
        BindingContext = viewModel;
    }
}
7. Converters (for XAML bindings)
using System.Globalization;

public class ListeningConverter : IValueConverter
{
    public object Convert(object value, Type targetType, object parameter, CultureInfo culture)
    {
        return (bool)value ? "Listening..." : "Start Listening";
    }

    public object ConvertBack(object value, Type targetType, object parameter, CultureInfo culture)
    {
        throw new NotImplementedException();
    }
}

public class InverseBoolConverter : IValueConverter
{
    public object Convert(object value, Type targetType, object parameter, CultureInfo culture)
    {
        return !(bool)value;
    }

    public object ConvertBack(object value, Type targetType, object parameter, CultureInfo culture)
    {
        throw new NotImplementedException();
    }
}
Platform-Specific Configuration
Android (Platforms/Android/MainApplication.cs)
[assembly: UsesPermission(Android.Manifest.Permission.RecordAudio)]
[assembly: UsesPermission(Android.Manifest.Permission.Internet)]
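On Android 6.0 and later, declaring the permission in the manifest is not enough; the app must also request microphone access at runtime. A small sketch using the MAUI Permissions API, which you can call before starting to listen:

// Request microphone access at runtime (cross-platform MAUI Permissions API).
async Task<bool> EnsureMicrophonePermissionAsync()
{
    var status = await Permissions.CheckStatusAsync<Permissions.Microphone>();
    if (status != PermissionStatus.Granted)
    {
        status = await Permissions.RequestAsync<Permissions.Microphone>();
    }
    return status == PermissionStatus.Granted;
}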
iOS (Platforms/iOS/Info.plist)
<key>NSMicrophoneUsageDescription</key>
<string>This app needs access to the microphone for voice commands</string>
<key>UIBackgroundModes</key>
<array>
    <string>audio</string>
</array>
Continuous Listening Implementation
For continuous listening (such as a wake-word feature), you would need to implement an audio-processing pipeline. Here's a basic approach:
public class ContinuousListener
{
    private readonly ISpeechService _speechService;
    private readonly IOpenAIService _openAiService;
    private CancellationTokenSource _cts;

    public ContinuousListener(ISpeechService speechService, IOpenAIService openAiService)
    {
        _speechService = speechService;
        _openAiService = openAiService;
    }

    public async Task StartContinuousListeningAsync(string locale)
    {
        _cts = new CancellationTokenSource();

        while (!_cts.IsCancellationRequested)
        {
            try
            {
                var audioData = await _speechService.CaptureAudioAsync(1, _cts.Token);
                var text = await _openAiService.TranscribeAudioAsync(audioData);

                if (IsWakeWord(text))
                {
                    // Process the command
                    var response = await _openAiService.ProcessPromptAsync(text);
                    await _speechService.SpeakAsync(response, locale, _cts.Token);
                }
            }
            catch (OperationCanceledException)
            {
                // Listening was cancelled
            }
        }
    }

    public void StopListening()
    {
        _cts?.Cancel();
    }

    private bool IsWakeWord(string text)
    {
        // Implement your wake word detection logic
        return text.Contains("hey assistant", StringComparison.OrdinalIgnoreCase);
    }
}
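A brief usage sketch, assuming a hypothetical AssistantPage that receives the services via dependency injection and ties the listener to the page lifecycle:

// Hypothetical wiring: start the listener when the page appears, stop it when it disappears.
public partial class AssistantPage : ContentPage
{
    private readonly ContinuousListener _listener;

    public AssistantPage(ISpeechService speechService, IOpenAIService openAiService)
    {
        InitializeComponent();
        _listener = new ContinuousListener(speechService, openAiService);
    }

    protected override void OnAppearing()
    {
        base.OnAppearing();
        _ = _listener.StartContinuousListeningAsync("en-US"); // fire-and-forget background loop
    }

    protected override void OnDisappearing()
    {
        base.OnDisappearing();
        _listener.StopListening();
    }
}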
Important Considerations
API Keys: Store your OpenAI and Azure Speech keys securely, preferably in a secrets manager or secure configuration (see the key-storage sketch below).
Error Handling: Implement robust error handling for network issues, API rate limits, and transient failures (see the retry sketch below).
Performance: Audio processing can be resource-intensive. Optimize for your target platforms.
Privacy: Clearly inform users when you're recording audio and how the data is used.
Multilingual Support: The implementation supports multiple languages by passing different locale codes (see the voice-selection sketch below).
Whisper Integration: Using Whisper for transcription typically provides better accuracy than standard speech recognition services.
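For the API-key point above, a minimal sketch using .NET MAUI's SecureStorage; the key name is illustrative, and OpenAIService would then receive the key via its constructor instead of the hard-coded placeholder shown earlier:

// Sketch: keep the OpenAI key in platform-backed secure storage instead of source code.
public static class ApiKeyStore
{
    private const string OpenAIKeyName = "openai_api_key"; // hypothetical key name

    public static Task SaveOpenAIKeyAsync(string key) =>
        SecureStorage.Default.SetAsync(OpenAIKeyName, key);

    public static async Task<string> GetOpenAIKeyAsync()
    {
        // Returns null if the key has not been stored yet.
        return await SecureStorage.Default.GetAsync(OpenAIKeyName);
    }
}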
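For error handling, a simple retry helper around the network calls is one option; the attempt count, backoff, and the exceptions treated as transient below are illustrative:

// Sketch: retry transient failures (network blips, rate limits) with exponential backoff.
public static async Task<T> WithRetryAsync<T>(Func<Task<T>> action, int maxAttempts = 3)
{
    for (var attempt = 1; ; attempt++)
    {
        try
        {
            return await action();
        }
        catch (Exception ex) when (attempt < maxAttempts &&
                                   ex is HttpRequestException or TaskCanceledException)
        {
            // Back off before the next attempt: 1s, 2s, 4s, ...
            await Task.Delay(TimeSpan.FromSeconds(Math.Pow(2, attempt - 1)));
        }
    }
}

// Usage, e.g. in the view model:
// var response = await WithRetryAsync(() => _openAiService.ProcessPromptAsync(spokenText));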
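For multilingual support, you can also pick an explicit Azure neural voice per locale instead of the service default; the mapping below is a sketch with example voice names (check the Azure Speech voice gallery for your languages):

using Microsoft.CognitiveServices.Speech;

// Sketch: choose an explicit synthesis voice for each supported locale.
public static class VoiceSelector
{
    private static readonly Dictionary<string, string> VoiceByLocale = new()
    {
        ["en-US"] = "en-US-JennyNeural",
        ["es-ES"] = "es-ES-ElviraNeural",
        ["fr-FR"] = "fr-FR-DeniseNeural",
    };

    public static void Apply(SpeechConfig config, string locale)
    {
        config.SpeechSynthesisLanguage = locale;
        if (VoiceByLocale.TryGetValue(locale, out var voice))
        {
            config.SpeechSynthesisVoiceName = voice;
        }
    }
}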
This implementation provides a solid foundation for a voice-interactive AI app using .NET MAUI and OpenAI technologies. You can extend it with additional features such as conversation history, custom wake words, or specialized AI personas.