> ## Documentation Index
> Fetch the complete documentation index at: https://docs.fish.audio/llms.txt
> Use this file to discover all available pages before exploring further.

# Speech to Text Guide

> Convert audio recordings into accurate text transcriptions

export const AudioTranscript = ({voices, page}) => {
  const resolvedVoices = voices?.length ? voices : (() => {
    if (!page) return [];
    const baseUrl = 'https://pub-b995142090474379a930b856ab79b4d4.r2.dev/audio';
    const pageVoices = [{
      id: '8ef4a238714b45718ce04243307c57a7',
      name: 'E-girl'
    }, {
      id: '802e3bc2b27e49c2995d23ef70e6ac89',
      name: 'Energetic Male'
    }, {
      id: '933563129e564b19a115bedd57b7406a',
      name: 'Sarah'
    }, {
      id: 'bf322df2096a46f18c579d0baa36f41d',
      name: 'Adrian'
    }, {
      id: 'b347db033a6549378b48d00acb0d06cd',
      name: 'Selene'
    }, {
      id: '536d3a5e000945adb7038665781a4aca',
      name: 'Ethan'
    }];
    return pageVoices.map(voice => ({
      ...voice,
      url: `${baseUrl}/${page}/${voice.id}.mp3`
    }));
  })();
  const [selectedVoice, setSelectedVoice] = useState(0);
  const [isPlaying, setIsPlaying] = useState(false);
  const [currentTime, setCurrentTime] = useState(0);
  const [duration, setDuration] = useState(0);
  const [isDropdownOpen, setIsDropdownOpen] = useState(false);
  const audioRef = useRef(null);
  const dropdownRef = useRef(null);
  useEffect(() => {
    const audio = audioRef.current;
    if (!audio) return;
    const updateTime = () => setCurrentTime(audio.currentTime);
    const updateDuration = () => setDuration(audio.duration);
    const handleEnded = () => setIsPlaying(false);
    audio.addEventListener('timeupdate', updateTime);
    audio.addEventListener('loadedmetadata', updateDuration);
    audio.addEventListener('ended', handleEnded);
    return () => {
      audio.removeEventListener('timeupdate', updateTime);
      audio.removeEventListener('loadedmetadata', updateDuration);
      audio.removeEventListener('ended', handleEnded);
    };
  }, []);
  useEffect(() => {
    const handleClickOutside = event => {
      if (dropdownRef.current && !dropdownRef.current.contains(event.target)) {
        setIsDropdownOpen(false);
      }
    };
    if (isDropdownOpen) {
      document.addEventListener('mousedown', handleClickOutside);
    }
    return () => {
      document.removeEventListener('mousedown', handleClickOutside);
    };
  }, [isDropdownOpen]);
  useEffect(() => {
    if (audioRef.current) {
      audioRef.current.pause();
      audioRef.current.load();
      setIsPlaying(false);
      setCurrentTime(0);
    }
  }, [selectedVoice]);
  const togglePlay = () => {
    if (isPlaying) {
      audioRef.current.pause();
    } else {
      audioRef.current.play();
    }
    setIsPlaying(!isPlaying);
  };
  const handleProgressChange = e => {
    const newTime = parseFloat(e.target.value);
    audioRef.current.currentTime = newTime;
    setCurrentTime(newTime);
  };
  const formatTime = time => {
    if (isNaN(time)) return '0:00';
    const minutes = Math.floor(time / 60);
    const seconds = Math.floor(time % 60);
    return `${minutes}:${seconds.toString().padStart(2, '0')}`;
  };
  const currentVoice = resolvedVoices[selectedVoice];
  return <div className="border rounded-lg bg-card border-gray-200 dark:border-gray-800">
      {}
      <div className="grid grid-cols-3 items-center px-3 py-1.5 bg-muted border-b border-gray-200 dark:border-gray-800">
        <span className="text-xs font-medium">Listen to Page</span>

        <span className="text-xs font-semibold text-muted-foreground text-center">Powered by Fish Audio S2 Pro</span>

        {resolvedVoices.length > 1 ? <div className="relative justify-self-end" ref={dropdownRef}>
            <button onClick={() => setIsDropdownOpen(!isDropdownOpen)} className="flex items-center gap-1.5 px-3 py-1 rounded-full bg-muted hover:bg-gray-200 dark:hover:bg-gray-700 transition-all duration-200 cursor-pointer text-xs">
              <span className="text-muted-foreground">Voice:</span>
              <span className="font-medium">{resolvedVoices[selectedVoice]?.name}</span>
              <svg className={`w-3 h-3 transition-transform duration-200 ${isDropdownOpen ? 'rotate-180' : ''}`} fill="none" stroke="currentColor" viewBox="0 0 24 24">
                <path strokeLinecap="round" strokeLinejoin="round" strokeWidth={2} d="M19 9l-7 7-7-7" />
              </svg>
            </button>

            {isDropdownOpen && <div className="absolute right-0 mt-1 w-auto bg-white dark:bg-black border border-gray-200 dark:border-gray-700 rounded-lg overflow-hidden z-50">
                {resolvedVoices.map((voice, index) => <button key={index} onClick={() => {
    setSelectedVoice(index);
    setIsDropdownOpen(false);
  }} className={`w-full px-3 py-1.5 text-left text-xs hover:bg-gray-100 dark:hover:bg-gray-800 transition-colors flex items-center gap-2 ${index === selectedVoice ? 'bg-gray-100 dark:bg-gray-800 font-medium' : ''}`}>
                    {voice.id && <img src={`https://public-platform.r2.fish.audio/coverimage/${voice.id}`} alt={voice.name} className="w-5 h-5 rounded-full m-0 flex-shrink-0 object-cover" />}
                    <span className="flex-1 whitespace-nowrap">{voice.name}</span>
                  </button>)}
              </div>}
          </div> : <div className="justify-self-end" />}
      </div>

      {}
      <div className="px-3 py-1.5 bg-card">
        <audio ref={audioRef} src={currentVoice?.url} preload="metadata" />

        <div className="flex items-center gap-2">
          {}
          <button onClick={togglePlay} className="flex-shrink-0 w-6 h-6 flex items-center justify-center bg-gray-300 dark:bg-gray-600 text-gray-800 dark:text-gray-200 rounded-full hover:opacity-80 transition-opacity relative overflow-hidden" aria-label={isPlaying ? 'Pause' : 'Play'}>
            <div className="transition-transform duration-300 ease-in-out" style={{
    transform: isPlaying ? 'rotate(180deg)' : 'rotate(0deg)'
  }}>
              {isPlaying ? <svg className="w-3 h-3" fill="currentColor" viewBox="0 0 24 24">
                  <path d="M6 4h4v16H6V4zm8 0h4v16h-4V4z" />
                </svg> : <svg className="w-3 h-3 ml-0.5" fill="currentColor" viewBox="0 0 24 24">
                  <path d="M8 5v14l11-7z" />
                </svg>}
            </div>
          </button>

          {}
          <div className="flex-1 flex items-center gap-2">
            <span className="text-xs font-mono text-gray-500 dark:text-gray-400 min-w-[35px]">
              {formatTime(currentTime)}
            </span>

            <div className="flex-1 relative h-1 bg-gray-200 dark:bg-gray-700 rounded-full overflow-hidden">
              <div className="absolute top-0 left-0 h-full bg-gray-400 dark:bg-gray-500 transition-all duration-100" style={{
    width: `${duration ? currentTime / duration * 100 : 0}%`
  }} />
              <input type="range" min="0" max={duration || 0} value={currentTime} onChange={handleProgressChange} className="absolute top-0 left-0 w-full h-full opacity-0 cursor-pointer" />
            </div>
            <span className="text-xs font-mono text-gray-500 dark:text-gray-400 min-w-[35px]">
              {formatTime(duration)}
            </span>
          </div>
        </div>
      </div>
    </div>;
};

## Overview

Transform any audio recording into text with Fish Audio's speech recognition. Perfect for transcriptions, subtitles, and voice commands.

## Getting Started

### Web Interface

Transcribe audio instantly:

<Steps>
  <Step title="Visit Fish Audio">
    Go to [fish.audio](https://fish.audio) and log in
  </Step>

  <Step title="Navigate to Transcribe">
    Click on "Speech to Text" in your dashboard
  </Step>

  <Step title="Upload Audio">
    Select your audio file (MP3, WAV, M4A)
  </Step>

  <Step title="Get Transcription">
    Click "Transcribe" and copy your text
  </Step>
</Steps>

## Supported Formats

### Audio Files

**Accepted formats:**

* MP3 (recommended)
* WAV
* M4A
* OGG
* FLAC
* AAC

**File requirements:**

* Maximum size: 20MB
* Maximum duration: 60 minutes
* Minimum duration: 1 second

## Language Support

### Automatic Detection

The system automatically detects the language spoken in your audio. No configuration needed!

### Manual Selection

For better accuracy, specify the language:

**Major Languages:**

* English (en)
* Chinese (zh)
* Japanese (ja)

Support for **additional languages** is coming soon!

## Audio Quality Tips

### For Best Results

**Recording Environment:**

* Quiet room with minimal echo
* No background music
* Clear, consistent speaking voice
* One speaker at a time

**Audio Settings:**

* Sample rate: 16kHz or higher
* Bit rate: 128kbps or higher
* Mono or stereo (mono preferred)

### Common Issues

**Poor transcription quality?**

* Remove background noise
* Increase microphone volume
* Speak clearly and not too fast
* Avoid multiple speakers talking over each other

## Use Cases

### Meeting Transcription

Convert recorded meetings into searchable text:

1. Record your meeting (Zoom, Teams, etc.)
2. Export the audio file
3. Upload to Fish Audio
4. Get formatted transcription with timestamps

### Podcast Transcripts

Create written versions of your podcasts:

* Generate show notes automatically
* Create searchable content
* Improve accessibility
* Enable translations

### Video Subtitles

Generate subtitles for your videos:

1. Extract audio from video
2. Transcribe with Fish Audio
3. Get timestamped text
4. Import into video editor

### Voice Notes

Convert voice memos to text:

* Dictate ideas quickly
* Transcribe later for editing
* Search through voice notes
* Share as text documents

## Advanced Features

### Timestamps

Get precise timing for each spoken segment:

```
[00:00:00] Welcome to our podcast.
[00:00:03] Today we're discussing AI technology.
[00:00:07] Let's dive right in.
```

Perfect for:

* Creating subtitles
* Navigating long recordings
* Synchronizing with video
* Building searchable archives

### Speaker Detection

Identify different speakers in conversations:

```
Speaker 1: "What do you think about the proposal?"
Speaker 2: "I think it has potential."
Speaker 1: "Let's discuss the details."
```

### Punctuation & Formatting

Automatic formatting includes:

* Sentence capitalization
* Punctuation marks
* Paragraph breaks
* Number formatting

## Tips for Different Content

### Interviews

**Best practices:**

* Use a good microphone for each speaker
* Record in a quiet environment
* Speak one at a time
* Keep consistent volume levels

### Lectures & Presentations

**Optimize for:**

* Clear articulation of technical terms
* Pause between topics
* Repeat important points
* Avoid reading too fast

### Phone Calls

**Considerations:**

* Phone audio is lower quality
* Expect slightly lower accuracy
* Speak clearly and slowly
* Avoid speakerphone if possible

## Accuracy Expectations

### What Affects Accuracy

**Positive factors:**

* Clear audio quality
* Native speaker accent
* Common vocabulary
* Single speaker

**Challenging factors:**

* Heavy accents
* Technical jargon
* Multiple speakers
* Background noise

### Typical Accuracy Rates

* **Professional recording:** 95-98%
* **Clean amateur recording:** 90-95%
* **Phone/video calls:** 85-90%
* **Noisy environments:** 75-85%

## Post-Processing Tips

### Editing Transcriptions

After transcription:

1. **Review for accuracy** - Check names and technical terms
2. **Add formatting** - Break into paragraphs
3. **Correct errors** - Fix any misheard words
4. **Add context** - Include speaker names

### Export Options

Save your transcriptions as:

* Plain text (.txt)
* Word document (.docx)
* Subtitle file (.srt)
* PDF document

## Common Applications

### Business

* Meeting minutes
* Interview transcripts
* Call recordings
* Training materials

### Education

* Lecture notes
* Research interviews
* Student recordings
* Language learning

### Content Creation

* Video scripts
* Podcast show notes
* Social media captions
* Blog post drafts

### Accessibility

* Hearing impaired support
* Multi-language content
* Searchable archives
* Documentation

## Troubleshooting

### No Text Output

**Check:**

* Audio file isn't corrupted
* File format is supported
* Audio contains speech
* Volume is audible

### Incorrect Language

**Solutions:**

* Manually select the correct language
* Ensure majority of audio is in one language
* Separate multi-language content

### Missing Words

**Common causes:**

* Speaking too fast
* Mumbling or unclear speech
* Technical terms not recognized
* Very quiet sections

## Privacy & Security

### Your Data

* Audio files are processed securely
* Transcriptions are private to your account
* Files are not used for training
* Delete anytime from your account

### Sensitive Content

For confidential audio:

* Use on-premise solutions if available
* Review privacy policy
* Consider redacting sensitive information
* Download and delete after processing

## Best Practices Summary

1. **Start with quality audio** - Good input = good output
2. **Choose the right environment** - Quiet spaces work best
3. **Speak clearly** - Articulate and consistent pace
4. **Review and edit** - All transcriptions benefit from review
5. **Use appropriate tools** - Different content needs different approaches

## Get Support

Need help with transcription?

* **Try it free:** [fish.audio](https://fish.audio)
* **Community:** [Discord](https://discord.gg/fish-audio)
* **Email:** [support@fish.audio](mailto:support@fish.audio)
* **Status:** [status.fish.audio](https://status.fish.audio)
