diff --git a/ext/bg/background.html b/ext/bg/background.html
index 7fd1c477..f2f70d4d 100644
--- a/ext/bg/background.html
+++ b/ext/bg/background.html
@@ -39,7 +39,7 @@
         <script src="/bg/js/request.js"></script>
         <script src="/bg/js/translator.js"></script>
         <script src="/bg/js/util.js"></script>
-        <script src="/mixed/js/audio.js"></script>
+        <script src="/mixed/js/audio-system.js"></script>
 
         <script src="/bg/js/backend.js"></script>
     </body>
diff --git a/ext/bg/js/api.js b/ext/bg/js/api.js
index 93e43a7d..4e5d81db 100644
--- a/ext/bg/js/api.js
+++ b/ext/bg/js/api.js
@@ -21,10 +21,6 @@ function apiTemplateRender(template, data) {
     return _apiInvoke('templateRender', {data, template});
 }
 
-function apiAudioGetUrl(definition, source, optionsContext) {
-    return _apiInvoke('audioGetUrl', {definition, source, optionsContext});
-}
-
 function _apiInvoke(action, params={}) {
     const data = {action, params};
     return new Promise((resolve, reject) => {
diff --git a/ext/bg/js/audio.js b/ext/bg/js/audio.js
index 972e2b8b..c94121ae 100644
--- a/ext/bg/js/audio.js
+++ b/ext/bg/js/audio.js
@@ -16,7 +16,7 @@
  * along with this program.  If not, see <https://www.gnu.org/licenses/>.
  */
 
-/*global jpIsStringEntirelyKana, audioGetFromSources*/
+/*global jpIsStringEntirelyKana*/
 
 const audioUrlBuilders = new Map([
     ['jpod101', async (definition) => {
@@ -154,7 +154,7 @@ function audioBuildFilename(definition) {
     return null;
 }
 
-async function audioInject(definition, fields, sources, optionsContext) {
+async function audioInject(definition, fields, sources, optionsContext, audioSystem) {
     let usesAudio = false;
     for (const fieldValue of Object.values(fields)) {
         if (fieldValue.includes('{audio}')) {
@@ -171,12 +171,10 @@ async function audioInject(definition, fields, sources, optionsContext) {
         const expressions = definition.expressions;
         const audioSourceDefinition = Array.isArray(expressions) ? expressions[0] : definition;
 
-        const {url} = await audioGetFromSources(audioSourceDefinition, sources, optionsContext, true);
-        if (url !== null) {
-            const filename = audioBuildFilename(audioSourceDefinition);
-            if (filename !== null) {
-                definition.audio = {url, filename};
-            }
+        const {uri} = await audioSystem.getDefinitionAudio(audioSourceDefinition, sources, {tts: false, optionsContext});
+        const filename = audioBuildFilename(audioSourceDefinition);
+        if (filename !== null) {
+            definition.audio = {url: uri, filename};
         }
 
         return true;
diff --git a/ext/bg/js/backend.js b/ext/bg/js/backend.js
index 04bf240d..60a87916 100644
--- a/ext/bg/js/backend.js
+++ b/ext/bg/js/backend.js
@@ -23,7 +23,7 @@ requestText, requestJson, optionsLoad
 dictConfigured, dictTermsSort, dictEnabledSet, dictNoteFormat
 audioGetUrl, audioInject
 jpConvertReading, jpDistributeFuriganaInflected, jpKatakanaToHiragana
-Translator, AnkiConnect, AnkiNull, Mecab, BackendApiForwarder, JsonSchema, ClipboardMonitor*/
+AudioSystem, Translator, AnkiConnect, AnkiNull, Mecab, BackendApiForwarder, JsonSchema, ClipboardMonitor*/
 
 class Backend {
     constructor() {
@@ -34,6 +34,7 @@ class Backend {
         this.options = null;
         this.optionsSchema = null;
         this.defaultAnkiFieldTemplates = null;
+        this.audioSystem = new AudioSystem({getAudioUri: this._getAudioUri.bind(this)});
         this.optionsContext = {
             depth: 0,
             url: window.location.href
@@ -436,7 +437,8 @@ class Backend {
                 definition,
                 options.anki.terms.fields,
                 options.audio.sources,
-                optionsContext
+                optionsContext,
+                this.audioSystem
             );
         }
 
@@ -762,6 +764,16 @@ class Backend {
 
     // Utilities
 
+    async _getAudioUri(definition, source, details) {
+        let optionsContext = (typeof details === 'object' && details !== null ? details.optionsContext : null);
+        if (!(typeof optionsContext === 'object' && optionsContext !== null)) {
+            optionsContext = this.optionsContext;
+        }
+
+        const options = this.getOptions(optionsContext);
+        return await audioGetUrl(definition, source, options);
+    }
+
     async _injectScreenshot(definition, fields, screenshot) {
         let usesScreenshot = false;
         for (const fieldValue of Object.values(fields)) {
diff --git a/ext/bg/js/settings/audio.js b/ext/bg/js/settings/audio.js
index 6d183a43..6f581d9b 100644
--- a/ext/bg/js/settings/audio.js
+++ b/ext/bg/js/settings/audio.js
@@ -16,12 +16,20 @@
  * along with this program.  If not, see <https://www.gnu.org/licenses/>.
  */
 
-/*global getOptionsContext, getOptionsMutable, settingsSaveOptions
-AudioSourceUI, audioGetTextToSpeechVoice*/
+/*global getOptionsContext, getOptionsMutable, settingsSaveOptions, apiAudioGetUrl
+AudioSystem, AudioSourceUI*/
 
 let audioSourceUI = null;
+let audioSystem = null;
 
 async function audioSettingsInitialize() {
+    audioSystem = new AudioSystem({
+        getAudioUri: async (definition, source) => {
+            const optionsContext = getOptionsContext();
+            return await apiAudioGetUrl(definition, source, optionsContext);
+        }
+    });
+
     const optionsContext = getOptionsContext();
     const options = await getOptionsMutable(optionsContext);
     audioSourceUI = new AudioSourceUI.Container(
@@ -100,16 +108,11 @@ function textToSpeechVoiceCompare(a, b) {
 function textToSpeechTest() {
     try {
         const text = document.querySelector('#text-to-speech-voice-test').dataset.speechText || '';
-        const voiceURI = document.querySelector('#text-to-speech-voice').value;
-        const voice = audioGetTextToSpeechVoice(voiceURI);
-        if (voice === null) { return; }
+        const voiceUri = document.querySelector('#text-to-speech-voice').value;
 
-        const utterance = new SpeechSynthesisUtterance(text);
-        utterance.lang = 'ja-JP';
-        utterance.voice = voice;
-        utterance.volume = 1.0;
-
-        speechSynthesis.speak(utterance);
+        const audio = audioSystem.createTextToSpeechAudio({text, voiceUri});
+        audio.volume = 1.0;
+        audio.play();
     } catch (e) {
         // NOP
     }
diff --git a/ext/bg/search.html b/ext/bg/search.html
index d6336826..f4c1a737 100644
--- a/ext/bg/search.html
+++ b/ext/bg/search.html
@@ -80,7 +80,7 @@
         <script src="/bg/js/japanese.js"></script>
         <script src="/fg/js/document.js"></script>
         <script src="/fg/js/source.js"></script>
-        <script src="/mixed/js/audio.js"></script>
+        <script src="/mixed/js/audio-system.js"></script>
         <script src="/mixed/js/display-context.js"></script>
         <script src="/mixed/js/display.js"></script>
         <script src="/mixed/js/display-generator.js"></script>
diff --git a/ext/bg/settings.html b/ext/bg/settings.html
index b048a36c..e9fc6be5 100644
--- a/ext/bg/settings.html
+++ b/ext/bg/settings.html
@@ -1098,7 +1098,7 @@
         <script src="/bg/js/page-exit-prevention.js"></script>
         <script src="/bg/js/profile-conditions.js"></script>
         <script src="/bg/js/util.js"></script>
-        <script src="/mixed/js/audio.js"></script>
+        <script src="/mixed/js/audio-system.js"></script>
 
         <script src="/bg/js/settings/anki.js"></script>
         <script src="/bg/js/settings/anki-templates.js"></script>
diff --git a/ext/fg/float.html b/ext/fg/float.html
index 352a866a..7bbed565 100644
--- a/ext/fg/float.html
+++ b/ext/fg/float.html
@@ -46,7 +46,7 @@
 
         <script src="/fg/js/document.js"></script>
         <script src="/fg/js/source.js"></script>
-        <script src="/mixed/js/audio.js"></script>
+        <script src="/mixed/js/audio-system.js"></script>
         <script src="/mixed/js/display-context.js"></script>
         <script src="/mixed/js/display.js"></script>
         <script src="/mixed/js/display-generator.js"></script>
diff --git a/ext/mixed/js/audio-system.js b/ext/mixed/js/audio-system.js
new file mode 100644
index 00000000..31c476b1
--- /dev/null
+++ b/ext/mixed/js/audio-system.js
@@ -0,0 +1,185 @@
+/*
+ * Copyright (C) 2019-2020  Alex Yatskov <alex@foosoft.net>
+ * Author: Alex Yatskov <alex@foosoft.net>
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <https://www.gnu.org/licenses/>.
+ */
+
+class TextToSpeechAudio {
+    constructor(text, voice) {
+        this.text = text;
+        this.voice = voice;
+        this._utterance = null;
+        this._volume = 1;
+    }
+
+    get currentTime() {
+        return 0;
+    }
+    set currentTime(value) {
+        // NOP
+    }
+
+    get volume() {
+        return this._volume;
+    }
+    set volume(value) {
+        this._volume = value;
+        if (this._utterance !== null) {
+            this._utterance.volume = value;
+        }
+    }
+
+    play() {
+        try {
+            if (this._utterance === null) {
+                this._utterance = new SpeechSynthesisUtterance(this.text || '');
+                this._utterance.lang = 'ja-JP';
+                this._utterance.volume = this._volume;
+                this._utterance.voice = this.voice;
+            }
+
+            speechSynthesis.cancel();
+            speechSynthesis.speak(this._utterance);
+        } catch (e) {
+            // NOP
+        }
+    }
+
+    pause() {
+        try {
+            speechSynthesis.cancel();
+        } catch (e) {
+            // NOP
+        }
+    }
+}
+
+class AudioSystem {
+    constructor({getAudioUri}) {
+        this._cache = new Map();
+        this._cacheSizeMaximum = 32;
+        this._getAudioUri = getAudioUri;
+
+        if (typeof speechSynthesis !== 'undefined') {
+            // speechSynthesis.getVoices() will not be populated unless some API call is made.
+            speechSynthesis.addEventListener('voiceschanged', this._onVoicesChanged.bind(this));
+        }
+    }
+
+    async getDefinitionAudio(definition, sources, details) {
+        const key = `${definition.expression}:${definition.reading}`;
+        const cacheValue = this._cache.get(definition);
+        if (typeof cacheValue !== 'undefined') {
+            const {audio, uri, source} = cacheValue;
+            return {audio, uri, source};
+        }
+
+        for (const source of sources) {
+            const uri = await this._getAudioUri(definition, source, details);
+            if (uri === null) { continue; }
+
+            try {
+                const audio = await this._createAudio(uri, details);
+                this._cacheCheck();
+                this._cache.set(key, {audio, uri, source});
+                return {audio, uri, source};
+            } catch (e) {
+                // NOP
+            }
+        }
+
+        throw new Error('Could not create audio');
+    }
+
+    createTextToSpeechAudio({text, voiceUri}) {
+        const voice = this._getTextToSpeechVoiceFromVoiceUri(voiceUri);
+        if (voice === null) {
+            throw new Error('Invalid text-to-speech voice');
+        }
+        return new TextToSpeechAudio(text, voice);
+    }
+
+    _onVoicesChanged() {
+        // NOP
+    }
+
+    async _createAudio(uri, details) {
+        const ttsParameters = this._getTextToSpeechParameters(uri);
+        if (ttsParameters !== null) {
+            if (typeof details === 'object' && details !== null) {
+                if (details.tts === false) {
+                    throw new Error('Text-to-speech not permitted');
+                }
+            }
+            return this.createTextToSpeechAudio(ttsParameters);
+        }
+
+        return await this._createAudioFromUrl(uri);
+    }
+
+    _createAudioFromUrl(url) {
+        return new Promise((resolve, reject) => {
+            const audio = new Audio(url);
+            audio.addEventListener('loadeddata', () => {
+                const duration = audio.duration;
+                if (duration === 5.694694 || duration === 5.720718) {
+                    // Hardcoded values for invalid audio
+                    reject(new Error('Could not retrieve audio'));
+                } else {
+                    resolve(audio);
+                }
+            });
+            audio.addEventListener('error', () => reject(audio.error));
+        });
+    }
+
+    _getTextToSpeechVoiceFromVoiceUri(voiceUri) {
+        try {
+            for (const voice of speechSynthesis.getVoices()) {
+                if (voice.voiceURI === voiceUri) {
+                    return voice;
+                }
+            }
+        } catch (e) {
+            // NOP
+        }
+        return null;
+    }
+
+    _getTextToSpeechParameters(uri) {
+        const m = /^tts:[^#?]*\?([^#]*)/.exec(uri);
+        if (m === null) { return null; }
+
+        const searchParameters = new URLSearchParams(m[1]);
+        const text = searchParameters.get('text');
+        const voiceUri = searchParameters.get('voice');
+        return (text !== null && voiceUri !== null ? {text, voiceUri} : null);
+    }
+
+    _cacheCheck() {
+        const removeCount = this._cache.size - this._cacheSizeMaximum;
+        if (removeCount <= 0) { return; }
+
+        const removeKeys = [];
+        for (const key of this._cache.keys()) {
+            removeKeys.push(key);
+            if (removeKeys.length >= removeCount) { break; }
+        }
+
+        for (const key of removeKeys) {
+            this._cache.delete(key);
+        }
+    }
+}
diff --git a/ext/mixed/js/audio.js b/ext/mixed/js/audio.js
deleted file mode 100644
index b5a025be..00000000
--- a/ext/mixed/js/audio.js
+++ /dev/null
@@ -1,178 +0,0 @@
-/*
- * Copyright (C) 2019-2020  Alex Yatskov <alex@foosoft.net>
- * Author: Alex Yatskov <alex@foosoft.net>
- *
- * This program is free software: you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation, either version 3 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program.  If not, see <https://www.gnu.org/licenses/>.
- */
-
-/*global apiAudioGetUrl*/
-
-class TextToSpeechAudio {
-    constructor(text, voice) {
-        this.text = text;
-        this.voice = voice;
-        this._utterance = null;
-        this._volume = 1;
-    }
-
-    get currentTime() {
-        return 0;
-    }
-    set currentTime(value) {
-        // NOP
-    }
-
-    get volume() {
-        return this._volume;
-    }
-    set volume(value) {
-        this._volume = value;
-        if (this._utterance !== null) {
-            this._utterance.volume = value;
-        }
-    }
-
-    play() {
-        try {
-            if (this._utterance === null) {
-                this._utterance = new SpeechSynthesisUtterance(this.text || '');
-                this._utterance.lang = 'ja-JP';
-                this._utterance.volume = this._volume;
-                this._utterance.voice = this.voice;
-            }
-
-            speechSynthesis.cancel();
-            speechSynthesis.speak(this._utterance);
-        } catch (e) {
-            // NOP
-        }
-    }
-
-    pause() {
-        try {
-            speechSynthesis.cancel();
-        } catch (e) {
-            // NOP
-        }
-    }
-
-    static createFromUri(ttsUri) {
-        const m = /^tts:[^#?]*\?([^#]*)/.exec(ttsUri);
-        if (m === null) { return null; }
-
-        const searchParameters = new URLSearchParams(m[1]);
-        const text = searchParameters.get('text');
-        let voice = searchParameters.get('voice');
-        if (text === null || voice === null) { return null; }
-
-        voice = audioGetTextToSpeechVoice(voice);
-        if (voice === null) { return null; }
-
-        return new TextToSpeechAudio(text, voice);
-    }
-}
-
-function audioGetFromUrl(url, willDownload) {
-    const tts = TextToSpeechAudio.createFromUri(url);
-    if (tts !== null) {
-        if (willDownload) {
-            throw new Error('AnkiConnect does not support downloading text-to-speech audio.');
-        }
-        return Promise.resolve(tts);
-    }
-
-    return new Promise((resolve, reject) => {
-        const audio = new Audio(url);
-        audio.addEventListener('loadeddata', () => {
-            if (audio.duration === 5.694694 || audio.duration === 5.720718) {
-                // Hardcoded values for invalid audio
-                reject(new Error('Could not retrieve audio'));
-            } else {
-                resolve(audio);
-            }
-        });
-        audio.addEventListener('error', () => reject(audio.error));
-    });
-}
-
-async function audioGetFromSources(expression, sources, optionsContext, willDownload, cache=null) {
-    const key = `${expression.expression}:${expression.reading}`;
-    if (cache !== null) {
-        const cacheValue = cache.get(expression);
-        if (typeof cacheValue !== 'undefined') {
-            return cacheValue;
-        }
-    }
-
-    for (let i = 0, ii = sources.length; i < ii; ++i) {
-        const source = sources[i];
-        const url = await apiAudioGetUrl(expression, source, optionsContext);
-        if (url === null) {
-            continue;
-        }
-
-        try {
-            let audio = await audioGetFromUrl(url, willDownload);
-            if (willDownload) {
-                // AnkiConnect handles downloading URLs into cards
-                audio = null;
-            }
-            const result = {audio, url, source};
-            if (cache !== null) {
-                cache.set(key, result);
-            }
-            return result;
-        } catch (e) {
-            // NOP
-        }
-    }
-    return {audio: null, url: null, source: null};
-}
-
-function audioGetTextToSpeechVoice(voiceURI) {
-    try {
-        for (const voice of speechSynthesis.getVoices()) {
-            if (voice.voiceURI === voiceURI) {
-                return voice;
-            }
-        }
-    } catch (e) {
-        // NOP
-    }
-    return null;
-}
-
-function audioPrepareTextToSpeech(options) {
-    if (
-        audioPrepareTextToSpeech.state ||
-        !options.audio.textToSpeechVoice ||
-        !(
-            options.audio.sources.includes('text-to-speech') ||
-            options.audio.sources.includes('text-to-speech-reading')
-        )
-    ) {
-        // Text-to-speech not in use.
-        return;
-    }
-
-    // Chrome needs this value called once before it will become populated.
-    // The first call will return an empty list.
-    audioPrepareTextToSpeech.state = true;
-    try {
-        speechSynthesis.getVoices();
-    } catch (e) {
-        // NOP
-    }
-}
-audioPrepareTextToSpeech.state = false;
diff --git a/ext/mixed/js/display.js b/ext/mixed/js/display.js
index 6a762a65..3fe8e684 100644
--- a/ext/mixed/js/display.js
+++ b/ext/mixed/js/display.js
@@ -18,9 +18,8 @@
 
 /*global docRangeFromPoint, docSentenceExtract
 apiKanjiFind, apiTermsFind, apiNoteView, apiOptionsGet, apiDefinitionsAddable, apiDefinitionAdd
-apiScreenshotGet, apiForward
-audioPrepareTextToSpeech, audioGetFromSources
-DisplayGenerator, WindowScroll, DisplayContext, DOM*/
+apiScreenshotGet, apiForward, apiAudioGetUrl
+AudioSystem, DisplayGenerator, WindowScroll, DisplayContext, DOM*/
 
 class Display {
     constructor(spinner, container) {
@@ -32,7 +31,7 @@ class Display {
         this.index = 0;
         this.audioPlaying = null;
         this.audioFallback = null;
-        this.audioCache = new Map();
+        this.audioSystem = new AudioSystem({getAudioUri: this._getAudioUri.bind(this)});
         this.styleNode = null;
 
         this.eventListeners = new EventListenerCollection();
@@ -364,7 +363,6 @@ class Display {
         this.updateDocumentOptions(this.options);
         this.updateTheme(this.options.general.popupTheme);
         this.setCustomCss(this.options.general.customPopupCss);
-        audioPrepareTextToSpeech(this.options);
     }
 
     updateDocumentOptions(options) {
@@ -775,16 +773,16 @@ class Display {
             }
 
             const sources = this.options.audio.sources;
-            let {audio, source} = await audioGetFromSources(expression, sources, this.getOptionsContext(), false, this.audioCache);
-            let info;
-            if (audio === null) {
+            let audio, source, info;
+            try {
+                ({audio, source} = await this.audioSystem.getDefinitionAudio(expression, sources));
+                info = `From source ${1 + sources.indexOf(source)}: ${source}`;
+            } catch (e) {
                 if (this.audioFallback === null) {
                     this.audioFallback = new Audio('/mixed/mp3/button.mp3');
                 }
                 audio = this.audioFallback;
                 info = 'Could not find audio';
-            } else {
-                info = `From source ${1 + sources.indexOf(source)}: ${source}`;
             }
 
             const button = this.audioButtonFindImage(entryIndex);
@@ -918,4 +916,9 @@ class Display {
         const key = event.key;
         return (typeof key === 'string' ? (key.length === 1 ? key.toUpperCase() : key) : '');
     }
+
+    async _getAudioUri(definition, source) {
+        const optionsContext = this.getOptionsContext();
+        return await apiAudioGetUrl(definition, source, optionsContext);
+    }
 }