This post walks through building a .NET MAUI application that uses OpenAI's technologies for voice-based interaction: speech recognition, prompt processing with GPT, and text-to-speech responses.
Project Setup
1. Create a new .NET MAUI project
dotnet new maui -n VoiceAIApp
cd VoiceAIApp
2. Add required NuGet packages
dotnet add package OpenAI
dotnet add package Microsoft.CognitiveServices.Speech
dotnet add package CommunityToolkit.Maui
dotnet add package CommunityToolkit.Maui.MediaElement
dotnet add package CommunityToolkit.Mvvm
Implementation
1. Configure App Services (MauiProgram.cs)
using CommunityToolkit.Maui;
using Microsoft.Extensions.Logging;

public static class MauiProgram
{
    public static MauiApp CreateMauiApp()
    {
        var builder = MauiApp.CreateBuilder();
        builder
            .UseMauiApp<App>()
            .UseMauiCommunityToolkit()
            .UseMauiCommunityToolkitMediaElement()
            .ConfigureFonts(fonts =>
            {
                fonts.AddFont("OpenSans-Regular.ttf", "OpenSansRegular");
                fonts.AddFont("OpenSans-Semibold.ttf", "OpenSansSemibold");
            });

#if DEBUG
        builder.Logging.AddDebug();
#endif

        // Register services and pages for dependency injection
        builder.Services.AddSingleton<IOpenAIService, OpenAIService>();
        builder.Services.AddSingleton<ISpeechService, SpeechService>();
        builder.Services.AddSingleton<MainViewModel>();
        builder.Services.AddSingleton<MainPage>();

        return builder.Build();
    }
}
2. OpenAI Service (OpenAIService.cs)
using OpenAI_API;
using OpenAI_API.Audio;
using OpenAI_API.Chat;

public interface IOpenAIService
{
    Task<string> ProcessPromptAsync(string prompt);
    Task<string> TranscribeAudioAsync(byte[] audioData);
    Task<Stream> GenerateSpeechAsync(string text);
}

public class OpenAIService : IOpenAIService
{
    private readonly OpenAIAPI _openAiApi;

    public OpenAIService()
    {
        // Initialize with your API key (store securely!)
        _openAiApi = new OpenAIAPI("your-openai-api-key");
    }

    public async Task<string> ProcessPromptAsync(string prompt)
    {
        var chatRequest = new ChatRequest
        {
            Messages = new List<ChatMessage>
            {
                new ChatMessage(ChatMessageRole.User, prompt)
            },
            Model = "gpt-4"
        };

        var response = await _openAiApi.Chat.CreateChatCompletionAsync(chatRequest);
        return response.Choices[0].Message.Content;
    }

    public async Task<string> TranscribeAudioAsync(byte[] audioData)
    {
        var transcriptionRequest = new TranscriptionRequest
        {
            AudioData = audioData,
            Model = "whisper-1"
        };

        var response = await _openAiApi.Transcriptions.GetTranscriptionAsync(transcriptionRequest);
        return response.Text;
    }

    public async Task<Stream> GenerateSpeechAsync(string text)
    {
        var ttsRequest = new TTSRequest
        {
            Input = text,
            Voice = "alloy", // or "echo", "fable", "onyx", "nova", "shimmer"
            Model = "tts-1",
            ResponseFormat = "mp3"
        };

        return await _openAiApi.TextToSpeech.GetSpeechAsStreamAsync(ttsRequest);
    }
}
3. Speech Service (SpeechService.cs)
using Microsoft.CognitiveServices.Speech;
using Microsoft.CognitiveServices.Speech.Audio;

public interface ISpeechService
{
    Task<string> ListenAsync(string locale, CancellationToken cancellationToken);
    Task SpeakAsync(string text, string locale, CancellationToken cancellationToken);
    Task<byte[]> CaptureAudioAsync(int durationSeconds, CancellationToken cancellationToken);
}

public class SpeechService : ISpeechService
{
    private readonly string _speechKey = "your-azure-speech-key";
    private readonly string _speechRegion = "your-region";

    public async Task<string> ListenAsync(string locale, CancellationToken cancellationToken)
    {
        var config = SpeechConfig.FromSubscription(_speechKey, _speechRegion);
        config.SpeechRecognitionLanguage = locale;

        using var audioConfig = AudioConfig.FromDefaultMicrophoneInput();
        using var recognizer = new SpeechRecognizer(config, audioConfig);

        var result = await recognizer.RecognizeOnceAsync();
        return result.Text;
    }

    public async Task SpeakAsync(string text, string locale, CancellationToken cancellationToken)
    {
        var config = SpeechConfig.FromSubscription(_speechKey, _speechRegion);
        config.SpeechSynthesisLanguage = locale;

        using var synthesizer = new SpeechSynthesizer(config);
        await synthesizer.SpeakTextAsync(text);
    }

    public async Task<byte[]> CaptureAudioAsync(int durationSeconds, CancellationToken cancellationToken)
    {
        // This is a simplified version - you'd need to implement proper audio capture
        var audioConfig = AudioConfig.FromDefaultMicrophoneInput();
        var audioStream = AudioInputStream.CreatePushStream(
            AudioStreamFormat.GetWaveFormatPCM(16000, 16, 1));

        // Capture audio for the specified duration
        // Implementation depends on your specific requirements
        return new byte[0]; // Return captured audio bytes
    }
}
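The CaptureAudioAsync stub above still needs a real recording path, and .NET MAUI has no built-in cross-platform microphone capture. One option is the Plugin.Maui.Audio NuGet package; the minimal sketch below assumes that package is installed and uses its IAudioRecorder API (it is not part of the original service).

// Sketch only: assumes the Plugin.Maui.Audio NuGet package is installed;
// API names follow that plugin, not the services defined in this post.
using Plugin.Maui.Audio;

public class PluginAudioCapture
{
    private readonly IAudioManager _audioManager = AudioManager.Current;

    public async Task<byte[]> CaptureAudioAsync(int durationSeconds, CancellationToken cancellationToken)
    {
        var recorder = _audioManager.CreateRecorder();
        await recorder.StartAsync();

        try
        {
            // Record for the requested duration (or until cancelled).
            await Task.Delay(TimeSpan.FromSeconds(durationSeconds), cancellationToken);
        }
        catch (OperationCanceledException)
        {
            // Stop early if the caller cancels; fall through to StopAsync.
        }

        var audioSource = await recorder.StopAsync();

        // Copy the recorded stream into a byte array for the Whisper upload.
        using var audioStream = audioSource.GetAudioStream();
        using var memoryStream = new MemoryStream();
        await audioStream.CopyToAsync(memoryStream);
        return memoryStream.ToArray();
    }
}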
4. ViewModel (MainViewModel.cs)
using CommunityToolkit.Mvvm.ComponentModel;
using CommunityToolkit.Mvvm.Input;

public partial class MainViewModel : ObservableObject
{
    private readonly IOpenAIService _openAiService;
    private readonly ISpeechService _speechService;
    private CancellationTokenSource _listeningCts;

    [ObservableProperty]
    private string _statusMessage;

    [ObservableProperty]
    private bool _isListening;

    public MainViewModel(IOpenAIService openAiService, ISpeechService speechService)
    {
        _openAiService = openAiService;
        _speechService = speechService;
        StatusMessage = "Ready to listen...";
    }

    [RelayCommand]
    private async Task StartListeningAsync(string locale = "en-US")
    {
        if (IsListening)
            return;

        IsListening = true;
        StatusMessage = "Listening...";
        _listeningCts = new CancellationTokenSource();

        try
        {
            // Option 1: Use Azure Speech-to-Text
            // var spokenText = await _speechService.ListenAsync(locale, _listeningCts.Token);

            // Option 2: Use Whisper for better accuracy
            var audioData = await _speechService.CaptureAudioAsync(5, _listeningCts.Token);
            var spokenText = await _openAiService.TranscribeAudioAsync(audioData);

            if (!string.IsNullOrWhiteSpace(spokenText))
            {
                StatusMessage = "Processing...";
                var response = await _openAiService.ProcessPromptAsync(spokenText);

                StatusMessage = "Speaking...";
                await _speechService.SpeakAsync(response, locale, _listeningCts.Token);
            }
        }
        catch (OperationCanceledException)
        {
            // Listening was stopped by the user.
        }
        catch (Exception ex)
        {
            StatusMessage = $"Error: {ex.Message}";
        }
        finally
        {
            IsListening = false;
            StatusMessage = "Ready to listen...";
        }
    }

    [RelayCommand]
    private void StopListening()
    {
        // Cancels the in-flight capture; bound to the Stop button in MainPage.xaml.
        _listeningCts?.Cancel();
    }
}
5. Main Page (MainPage.xaml)
<?xml version="1.0" encoding="utf-8" ?>
<ContentPage xmlns="http://schemas.microsoft.com/dotnet/2021/maui"
             xmlns:x="http://schemas.microsoft.com/winfx/2009/xaml"
             xmlns:local="clr-namespace:VoiceAIApp"
             x:Class="VoiceAIApp.MainPage"
             Title="Voice AI Assistant">

    <!-- The BindingContext is injected in the code-behind, so it is not set here. -->
    <ContentPage.Resources>
        <local:ListeningConverter x:Key="ListeningConverter" />
        <local:InverseBoolConverter x:Key="InverseBoolConverter" />
    </ContentPage.Resources>

    <ScrollView>
        <VerticalStackLayout Spacing="25" Padding="30,0" VerticalOptions="Center">

            <Label Text="Voice AI Assistant"
                   SemanticProperties.HeadingLevel="Level1"
                   FontSize="32"
                   HorizontalOptions="Center" />

            <Label Text="Speak and get AI responses"
                   SemanticProperties.HeadingLevel="Level2"
                   FontSize="18"
                   HorizontalOptions="Center" />

            <Label Text="{Binding StatusMessage}"
                   FontSize="16"
                   HorizontalOptions="Center"
                   TextColor="{AppThemeBinding Light={StaticResource Primary}, Dark={StaticResource Secondary}}" />

            <Button Text="{Binding IsListening, Converter={StaticResource ListeningConverter}}"
                    Command="{Binding StartListeningCommand}"
                    CommandParameter="en-US"
                    HorizontalOptions="Center"
                    IsEnabled="{Binding IsListening, Converter={StaticResource InverseBoolConverter}}"
                    SemanticProperties.Hint="Tap to start listening" />

            <Button Text="Stop Listening"
                    Command="{Binding StopListeningCommand}"
                    HorizontalOptions="Center"
                    IsVisible="{Binding IsListening}"
                    SemanticProperties.Hint="Tap to stop listening" />

            <Picker x:Name="LanguagePicker"
                    Title="Select Language"
                    HorizontalOptions="Center"
                    WidthRequest="200">
                <Picker.Items>
                    <x:String>en-US</x:String>
                    <x:String>es-ES</x:String>
                    <x:String>fr-FR</x:String>
                    <x:String>de-DE</x:String>
                    <x:String>it-IT</x:String>
                    <x:String>ja-JP</x:String>
                </Picker.Items>
            </Picker>

        </VerticalStackLayout>
    </ScrollView>

</ContentPage>
6. Main Page Code-behind (MainPage.xaml.cs)
public partial class MainPage : ContentPage
{
    public MainPage(MainViewModel viewModel)
    {
        InitializeComponent();
        BindingContext = viewModel;
    }
}
7. Converters (for XAML bindings)
using System.Globalization;

public class ListeningConverter : IValueConverter
{
    public object Convert(object value, Type targetType, object parameter, CultureInfo culture)
    {
        return (bool)value ? "Listening..." : "Start Listening";
    }

    public object ConvertBack(object value, Type targetType, object parameter, CultureInfo culture)
    {
        throw new NotImplementedException();
    }
}

public class InverseBoolConverter : IValueConverter
{
    public object Convert(object value, Type targetType, object parameter, CultureInfo culture)
    {
        return !(bool)value;
    }

    public object ConvertBack(object value, Type targetType, object parameter, CultureInfo culture)
    {
        throw new NotImplementedException();
    }
}
Platform-Specific Configuration
Android (Platforms/Android/MainApplication.cs)
[assembly: UsesPermission(Android.Manifest.Permission.RecordAudio)]
[assembly: UsesPermission(Android.Manifest.Permission.Internet)]
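On Android 6.0 and later, declaring the permission in the manifest is not enough; the app must also request microphone access at runtime. A small sketch using the MAUI Permissions API, which you can call before starting to listen:

// Request microphone access at runtime (cross-platform MAUI Permissions API).
async Task<bool> EnsureMicrophonePermissionAsync()
{
    var status = await Permissions.CheckStatusAsync<Permissions.Microphone>();
    if (status != PermissionStatus.Granted)
    {
        status = await Permissions.RequestAsync<Permissions.Microphone>();
    }
    return status == PermissionStatus.Granted;
}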
iOS (Platforms/iOS/Info.plist)
<key>NSMicrophoneUsageDescription</key>
<string>This app needs access to the microphone for voice commands</string>
<key>UIBackgroundModes</key>
<array>
    <string>audio</string>
</array>
Continuous Listening Implementation
For continuous listening (such as a wake-word feature), you would need to implement an audio-processing pipeline. Here's a basic approach:
public class ContinuousListener
{
    private readonly ISpeechService _speechService;
    private readonly IOpenAIService _openAiService;
    private CancellationTokenSource _cts;

    public ContinuousListener(ISpeechService speechService, IOpenAIService openAiService)
    {
        _speechService = speechService;
        _openAiService = openAiService;
    }

    public async Task StartContinuousListeningAsync(string locale)
    {
        _cts = new CancellationTokenSource();

        while (!_cts.IsCancellationRequested)
        {
            try
            {
                var audioData = await _speechService.CaptureAudioAsync(1, _cts.Token);
                var text = await _openAiService.TranscribeAudioAsync(audioData);

                if (IsWakeWord(text))
                {
                    // Process the command
                    var response = await _openAiService.ProcessPromptAsync(text);
                    await _speechService.SpeakAsync(response, locale, _cts.Token);
                }
            }
            catch (OperationCanceledException)
            {
                // Listening was cancelled
            }
        }
    }

    public void StopListening()
    {
        _cts?.Cancel();
    }

    private bool IsWakeWord(string text)
    {
        // Implement your wake word detection logic
        return text.Contains("hey assistant", StringComparison.OrdinalIgnoreCase);
    }
}
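A brief usage sketch, assuming a hypothetical AssistantPage that receives the services via dependency injection and ties the listener to the page lifecycle:

// Hypothetical wiring: start the listener when the page appears, stop it when it disappears.
public partial class AssistantPage : ContentPage
{
    private readonly ContinuousListener _listener;

    public AssistantPage(ISpeechService speechService, IOpenAIService openAiService)
    {
        InitializeComponent();
        _listener = new ContinuousListener(speechService, openAiService);
    }

    protected override void OnAppearing()
    {
        base.OnAppearing();
        _ = _listener.StartContinuousListeningAsync("en-US"); // fire-and-forget background loop
    }

    protected override void OnDisappearing()
    {
        base.OnDisappearing();
        _listener.StopListening();
    }
}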
Important Considerations
API Keys: Store your OpenAI and Azure Speech keys securely, preferably in a secrets manager or secure configuration (see the key-storage sketch below).
Error Handling: Implement robust error handling for network issues, API rate limits, and transient failures (see the retry sketch below).
Performance: Audio processing can be resource-intensive. Optimize for your target platforms.
Privacy: Clearly inform users when you're recording audio and how the data is used.
Multilingual Support: The implementation supports multiple languages by passing different locale codes (see the voice-selection sketch below).
Whisper Integration: Using Whisper for transcription typically provides better accuracy than standard speech recognition services.
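For the API-key point above, a minimal sketch using .NET MAUI's SecureStorage; the key name is illustrative, and OpenAIService would then receive the key via its constructor instead of the hard-coded placeholder shown earlier:

// Sketch: keep the OpenAI key in platform-backed secure storage instead of source code.
public static class ApiKeyStore
{
    private const string OpenAIKeyName = "openai_api_key"; // hypothetical key name

    public static Task SaveOpenAIKeyAsync(string key) =>
        SecureStorage.Default.SetAsync(OpenAIKeyName, key);

    public static async Task<string> GetOpenAIKeyAsync()
    {
        // Returns null if the key has not been stored yet.
        return await SecureStorage.Default.GetAsync(OpenAIKeyName);
    }
}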
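For error handling, a simple retry helper around the network calls is one option; the attempt count, backoff, and the exceptions treated as transient below are illustrative:

// Sketch: retry transient failures (network blips, rate limits) with exponential backoff.
public static async Task<T> WithRetryAsync<T>(Func<Task<T>> action, int maxAttempts = 3)
{
    for (var attempt = 1; ; attempt++)
    {
        try
        {
            return await action();
        }
        catch (Exception ex) when (attempt < maxAttempts &&
                                   ex is HttpRequestException or TaskCanceledException)
        {
            // Back off before the next attempt: 1s, 2s, 4s, ...
            await Task.Delay(TimeSpan.FromSeconds(Math.Pow(2, attempt - 1)));
        }
    }
}

// Usage, e.g. in the view model:
// var response = await WithRetryAsync(() => _openAiService.ProcessPromptAsync(spokenText));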
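For multilingual support, you can also pick an explicit Azure neural voice per locale instead of the service default; the mapping below is a sketch with example voice names (check the Azure Speech voice gallery for your languages):

using Microsoft.CognitiveServices.Speech;

// Sketch: choose an explicit synthesis voice for each supported locale.
public static class VoiceSelector
{
    private static readonly Dictionary<string, string> VoiceByLocale = new()
    {
        ["en-US"] = "en-US-JennyNeural",
        ["es-ES"] = "es-ES-ElviraNeural",
        ["fr-FR"] = "fr-FR-DeniseNeural",
    };

    public static void Apply(SpeechConfig config, string locale)
    {
        config.SpeechSynthesisLanguage = locale;
        if (VoiceByLocale.TryGetValue(locale, out var voice))
        {
            config.SpeechSynthesisVoiceName = voice;
        }
    }
}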
This implementation provides a solid foundation for a voice-interactive AI app using .NET MAUI and OpenAI technologies. You can extend it with additional features such as conversation history, custom wake words, or specialized AI personas.