H5/uniapp speech-to-text using Volcano Engine streaming speech recognition, with a Go backend. Audio is recorded in the Vue front end, the audio data is sent to the Go service, and Go then uploads it to the ASR API and returns the recognized text.
Volcano Engine documentation: https://www.volcengine.com/docs/6561/80818#_3-3-%E9%94%99%E8%AF%AF%E7%A0%81
Frontend implementation:
On the front end, the recorder-core plugin is used to record an MP3 file.
npm install recorder-core
The complete code is as follows:
<template>
<view class="ar-footer">
<slot>
<view class="ar-footer-button">
<image class="ar-footer-img" :src="keyboardPng" v-if="mode === 1" @click="setMode(2)" />
<image class="ar-footer-img" :src="voicePng" v-else @click="setMode(1)" />
</view>
<view class="ar-footer-wrapper">
<view class="ar-footer-text" v-if="mode === 1">
<input type="text" class="ar-footer-input" v-model="text" placeholder="Type a message..." @keydown="handleKeydown" />
<view class="ar-footer-send" @click="send">Send</view>
</view>
<button class="ar-footer-voice" v-else @touchstart="startVoiceRecord" @touchend="endVoiceRecord"
@mousedown="startVoiceRecord" @mouseup="endVoiceRecord">Hold to talk</button>
</view>
</slot>
</view>
</template>
<script setup lang="ts">
import { ref } from 'vue'
import keyboardPng from '../../../static/ai-images/keyboard.png'
import voicePng from '../../../static/ai-images/voice.png'
import { getPartnerList } from '@/api/ars_api';
import Recorder from 'recorder-core'
import 'recorder-core/src/engine/mp3'
import 'recorder-core/src/engine/mp3-engine'
import 'recorder-core/src/extensions/waveview'
const mode = ref(1)
const text = ref('')
const props = defineProps({
onSend: {
type: Function,
required: true
}
})
// Handle keyboard events: send on Enter
const handleKeydown = (event: KeyboardEvent) => {
if (event.key === 'Enter') {
send()
}
}
const setMode = (val: number) => {
if (val === 2) {
recOpen();
} else if (rec) {
// Close and release the recorder when switching back to text mode
rec.close();
rec = null;
}
mode.value = val
}
const send = () => {
props.onSend(text.value)
text.value = ''
}
// Hold-to-talk recording
let rec: any;
let wave: any;
const startVoiceRecord = async () => {
if (!rec) {
console.error("未打开录音");
return
}
rec.start();
console.log("已开始录音");
};
const endVoiceRecord = () => {
if (!rec) {
console.error("未打开录音");
return
}
rec.stop(async (blob: Blob, duration: number) => {
const result = await getPartnerList(blob);
props.onSend(result.data.result[0].text)
text.value = ''
}, (err: any) => {
console.error("结束录音出错:" + err);
rec.close();
rec = null;
});
};
const recOpen = async () => {
try {
rec = Recorder({
type: "mp3",
sampleRate: 16000,
bitRate: 16,
onProcess: (buffers: any, powerLevel: any, bufferDuration: any, bufferSampleRate: any, newBufferIdx: any, asyncEnd: any) => {
// Here you can draw the waveform in real time or stream (send) the data as it is captured
if (wave) wave.input(buffers[buffers.length - 1], powerLevel, bufferSampleRate);
}
});
// Open the recorder to request microphone permission
rec.open(() => {
console.log("Recorder opened");
// Optional: create the waveform drawing object here and bind it to a
// container element on the page (the selector below is a placeholder),
// so the onProcess callback above has a `wave` to draw into:
// wave = Recorder.WaveView({ elem: ".recwave" });
}, (msg: string, isUserNotAllow: boolean) => {
console.log((isUserNotAllow ? "UserNotAllow," : "") + "Cannot record: " + msg);
});
} catch (error) {
console.error('Failed to get microphone permission:', error);
}
}
</script>
ars_api upload interface:
//ars_api
import { base_url } from '@/config';
export async function getPartnerList(audioBlob: any) {
const formData = new FormData();
formData.append('file', audioBlob, 'recording.mp3');
const token = uni.getStorageSync('token');
try {
const response = await fetch(base_url + '/ars/getAsr', {
method: 'POST',
headers: {
Token: `Bearer ${token}`,
},
body: formData,
});
if (!response.ok) {
throw new Error('Network request failed');
}
const map = await response.json();
if (map.code !== 0) {
return Promise.reject(map.msg);
} else {
return map;
}
} catch (err) {
console.error('Request failed', err);
return Promise.reject(err);
}
}
Backend implementation:
API file:
// api
package ars
import (
"context"
"fmt"
"go-code-x/dal/model/response"
"go-code-x/global"
"go-code-x/utils"
"io"
"github.com/cloudwego/hertz/pkg/app"
)
type AsrApi struct {
}
func (a *AsrApi) GetAsr(c context.Context, ctx *app.RequestContext) {
header, err := ctx.Request.FormFile("file")
if err != nil {
response.FailWithMessage("接收文件失败", ctx)
return
}
client := utils.BuildAsrClient()
client.Appid = global.CONFIG.Volcanoasr.Appid
client.Token = global.CONFIG.Volcanoasr.Token
client.Cluster = global.CONFIG.Volcanoasr.Cluster
client.Format = global.CONFIG.Volcanoasr.AudioFormat
file, err := header.Open()
if err != nil {
response.FailWithMessage("打开文件失败", ctx)
return
}
defer file.Close()
// Read the file content into a byte slice
audioData, err := io.ReadAll(file)
if err != nil {
fmt.Println("fail to read audio file", err.Error())
response.FailWithMessage("failed to read audio file", ctx)
return
}
asrResponse, err := client.RequestAsr(audioData)
if err != nil {
fmt.Println("fail to request asr, ", err.Error())
response.FailWithMessage("speech recognition request failed", ctx)
return
}
response.OkWithData(asrResponse, ctx)
}
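The handler above also needs a Volcanoasr section in the global configuration and a route registration, neither of which is shown in the original listing. A minimal sketch follows; the config struct fields mirror the global.CONFIG.Volcanoasr.* references in GetAsr, while the import paths, yaml tags, port, and server setup are assumptions.
package main

import (
	"go-code-x/api/ars" // assumed import path for the AsrApi handler above

	"github.com/cloudwego/hertz/pkg/app/server"
)

// VolcanoasrConfig mirrors the global.CONFIG.Volcanoasr.* fields read in GetAsr;
// in the real project it would live in the config package that fills global.CONFIG.
type VolcanoasrConfig struct {
	Appid       string `yaml:"appid"`
	Token       string `yaml:"token"`
	Cluster     string `yaml:"cluster"`
	AudioFormat string `yaml:"audio-format"` // e.g. "mp3", matching the front-end recording
}

func main() {
	h := server.Default(server.WithHostPorts(":8888")) // port is a placeholder
	asrApi := &ars.AsrApi{}
	// The front end POSTs the recorded file to base_url + "/ars/getAsr"
	h.POST("/ars/getAsr", asrApi.GetAsr)
	h.Spin()
}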
To call Volcano Engine streaming speech recognition, the Go file from the official demo can be used directly:
package utils
import (
"bytes"
"compress/gzip"
"encoding/binary"
"encoding/json"
"errors"
"fmt"
"io/ioutil"
"net/http"
"github.com/gorilla/websocket"
uuid "github.com/satori/go.uuid"
)
type ProtocolVersion byte
type MessageType byte
type MessageTypeSpecificFlags byte
type SerializationType byte
type CompressionType byte
const (
SuccessCode = 1000
PROTOCOL_VERSION = ProtocolVersion(0b0001)
DEFAULT_HEADER_SIZE = 0b0001
PROTOCOL_VERSION_BITS = 4
HEADER_BITS = 4
MESSAGE_TYPE_BITS = 4
MESSAGE_TYPE_SPECIFIC_FLAGS_BITS = 4
MESSAGE_SERIALIZATION_BITS = 4
MESSAGE_COMPRESSION_BITS = 4
RESERVED_BITS = 8
// Message Type:
CLIENT_FULL_REQUEST = MessageType(0b0001)
CLIENT_AUDIO_ONLY_REQUEST = MessageType(0b0010)
SERVER_FULL_RESPONSE = MessageType(0b1001)
SERVER_ACK = MessageType(0b1011)
SERVER_ERROR_RESPONSE = MessageType(0b1111)
// Message Type Specific Flags
NO_SEQUENCE = MessageTypeSpecificFlags(0b0000) // no check sequence
POS_SEQUENCE = MessageTypeSpecificFlags(0b0001)
NEG_SEQUENCE = MessageTypeSpecificFlags(0b0010)
NEG_SEQUENCE_1 = MessageTypeSpecificFlags(0b0011)
// Message Serialization
NO_SERIALIZATION = SerializationType(0b0000)
JSON = SerializationType(0b0001)
THRIFT = SerializationType(0b0011)
CUSTOM_TYPE = SerializationType(0b1111)
// Message Compression
NO_COMPRESSION = CompressionType(0b0000)
GZIP = CompressionType(0b0001)
CUSTOM_COMPRESSION = CompressionType(0b1111)
)
// version: b0001 (4 bits)
// header size: b0001 (4 bits)
// message type: b0001 (Full client request) (4bits)
// message type specific flags: b0000 (none) (4bits)
// message serialization method: b0001 (JSON) (4 bits)
// message compression: b0001 (gzip) (4bits)
// reserved data: 0x00 (1 byte)
var DefaultFullClientWsHeader = []byte{0x11, 0x10, 0x11, 0x00}
var DefaultAudioOnlyWsHeader = []byte{0x11, 0x20, 0x11, 0x00}
var DefaultLastAudioWsHeader = []byte{0x11, 0x22, 0x11, 0x00}
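// The three hard-coded headers above can also be derived from the typed
// constants. The helper below is not part of the official demo; it is added
// here only to make the bit layout described in the comments explicit: each
// header byte packs two 4-bit fields.
func buildWsHeader(msgType MessageType, flags MessageTypeSpecificFlags) []byte {
	return []byte{
		byte(PROTOCOL_VERSION)<<4 | DEFAULT_HEADER_SIZE, // protocol version | header size (in 4-byte units)
		byte(msgType)<<4 | byte(flags),                  // message type | type-specific flags
		byte(JSON)<<4 | byte(GZIP),                      // serialization (JSON) | compression (gzip)
		0x00,                                            // reserved
	}
}

// buildWsHeader(CLIENT_FULL_REQUEST, NO_SEQUENCE) yields DefaultFullClientWsHeader (0x11 0x10 0x11 0x00),
// buildWsHeader(CLIENT_AUDIO_ONLY_REQUEST, NO_SEQUENCE) yields DefaultAudioOnlyWsHeader (0x11 0x20 0x11 0x00),
// and buildWsHeader(CLIENT_AUDIO_ONLY_REQUEST, NEG_SEQUENCE) yields DefaultLastAudioWsHeader (0x11 0x22 0x11 0x00).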
func GzipCompress(input []byte) []byte {
var b bytes.Buffer
w := gzip.NewWriter(&b)
w.Write(input)
w.Close()
return b.Bytes()
}
func GzipDecompress(input []byte) []byte {
b := bytes.NewBuffer(input)
r, _ := gzip.NewReader(b)
out, _ := ioutil.ReadAll(r)
r.Close()
return out
}
type AsrResponse struct {
Reqid string `json:"reqid"`
Code int `json:"code"`
Message string `json:"message"`
Sequence int `json:"sequence"`
Results []Result `json:"result,omitempty"`
}
type Result struct {
// required
Text string `json:"text"`
Confidence int `json:"confidence"`
// if show_language == true
Language string `json:"language,omitempty"`
// if show_utterances == true
Utterances []Utterance `json:"utterances,omitempty"`
}
type Utterance struct {
Text string `json:"text"`
StartTime int `json:"start_time"`
EndTime int `json:"end_time"`
Definite bool `json:"definite"`
Words []Word `json:"words"`
// if show_language = true
Language string `json:"language"`
}
type Word struct {
Text string `json:"text"`
StartTime int `json:"start_time"`
EndTime int `json:"end_time"`
Pronounce string `json:"pronounce"`
// in docs example - blank_time
BlankDuration int `json:"blank_duration"`
}
type WsHeader struct {
ProtocolVersion ProtocolVersion
DefaultHeaderSize int
MessageType MessageType
MessageTypeSpecificFlags MessageTypeSpecificFlags
SerializationType SerializationType
CompressionType CompressionType
}
type RequestAsr interface {
requestAsr(audio_data []byte)
}
type AsrClient struct {
Appid string
Token string
Cluster string
Workflow string
Format string
Codec string
SegSize int
Url string
}
func BuildAsrClient() AsrClient {
client := AsrClient{}
client.Workflow = "audio_in,resample,partition,vad,fe,decode"
client.SegSize = 160000 // 5s in 16000 sample rate
client.Format = "wav" // default wav audio
client.Codec = "raw" // default raw codec
return client
}
func (client *AsrClient) RequestAsr(audioData []byte) (AsrResponse, error) {
// set token header
var tokenHeader = http.Header{"Authorization": []string{fmt.Sprintf("Bearer;%s", client.Token)}}
c, _, err := websocket.DefaultDialer.Dial("wss://openspeech.bytedance.com/api/v2/asr", tokenHeader)
if err != nil {
fmt.Println(err)
return AsrResponse{}, err
}
defer c.Close()
// 1. send full client request
req := client.constructRequest()
payload := GzipCompress(req)
payloadSize := len(payload)
payloadSizeArr := make([]byte, 4)
binary.BigEndian.PutUint32(payloadSizeArr, uint32(payloadSize))
fullClientMsg := make([]byte, len(DefaultFullClientWsHeader))
copy(fullClientMsg, DefaultFullClientWsHeader)
fullClientMsg = append(fullClientMsg, payloadSizeArr...)
fullClientMsg = append(fullClientMsg, payload...)
c.WriteMessage(websocket.BinaryMessage, fullClientMsg)
_, msg, err := c.ReadMessage()
if err != nil {
fmt.Println("fail to read message fail, err:", err.Error())
return AsrResponse{}, err
}
asrResponse, err := client.parseResponse(msg)
if err != nil {
fmt.Println("fail to parse response ", err.Error())
return AsrResponse{}, err
}
// 3. send segment audio request
for sentSize := 0; sentSize < len(audioData); sentSize += client.SegSize {
lastAudio := false
if sentSize+client.SegSize >= len(audioData) {
lastAudio = true
}
dataSlice := make([]byte, 0)
audioMsg := make([]byte, len(DefaultAudioOnlyWsHeader))
if !lastAudio {
dataSlice = audioData[sentSize : sentSize+client.SegSize]
copy(audioMsg, DefaultAudioOnlyWsHeader)
} else {
dataSlice = audioData[sentSize:]
copy(audioMsg, DefaultLastAudioWsHeader)
}
payload = GzipCompress(dataSlice)
payloadSize := len(payload)
payloadSizeArr := make([]byte, 4)
binary.BigEndian.PutUint32(payloadSizeArr, uint32(payloadSize))
audioMsg = append(audioMsg, payloadSizeArr...)
audioMsg = append(audioMsg, payload...)
c.WriteMessage(websocket.BinaryMessage, audioMsg)
_, msg, err := c.ReadMessage()
if err != nil {
fmt.Println("fail to read message fail, err:", err.Error())
return AsrResponse{}, err
}
asrResponse, err = client.parseResponse(msg)
if err != nil {
fmt.Println("fail to parse response ", err.Error())
return AsrResponse{}, err
}
}
return asrResponse, nil
}
func (client *AsrClient) constructRequest() []byte {
reqid := uuid.NewV4().String()
req := make(map[string]map[string]interface{})
req["app"] = make(map[string]interface{})
req["app"]["appid"] = client.Appid
req["app"]["cluster"] = client.Cluster
req["app"]["token"] = client.Token
req["user"] = make(map[string]interface{})
req["user"]["uid"] = "uid"
req["request"] = make(map[string]interface{})
req["request"]["reqid"] = reqid
req["request"]["nbest"] = 1
req["request"]["workflow"] = client.Workflow
req["request"]["result_type"] = "full"
req["request"]["sequence"] = 1
req["audio"] = make(map[string]interface{})
req["audio"]["format"] = client.Format
req["audio"]["codec"] = client.Codec
reqStr, _ := json.Marshal(req)
return reqStr
}
func (client *AsrClient) parseResponse(msg []byte) (AsrResponse, error) {
//protocol_version := msg[0] >> 4
headerSize := msg[0] & 0x0f
messageType := msg[1] >> 4
//message_type_specific_flags := msg[1] & 0x0f
serializationMethod := msg[2] >> 4
messageCompression := msg[2] & 0x0f
//reserved := msg[3]
//header_extensions := msg[4:header_size * 4]
payload := msg[headerSize*4:]
payloadMsg := make([]byte, 0)
payloadSize := 0
//print('message type: {}'.format(message_type))
if messageType == byte(SERVER_FULL_RESPONSE) {
payloadSize = int(int32(binary.BigEndian.Uint32(payload[0:4])))
payloadMsg = payload[4:]
} else if messageType == byte(SERVER_ACK) {
seq := int32(binary.BigEndian.Uint32(payload[:4]))
if len(payload) >= 8 {
payloadSize = int(binary.BigEndian.Uint32(payload[4:8]))
payloadMsg = payload[8:]
}
fmt.Println("SERVER_ACK seq: ", seq)
} else if messageType == byte(SERVER_ERROR_RESPONSE) {
code := int32(binary.BigEndian.Uint32(payload[:4]))
payloadSize = int(binary.BigEndian.Uint32(payload[4:8]))
payloadMsg = payload[8:]
fmt.Println("SERVER_ERROR_RESPONE code: ", code)
return AsrResponse{}, errors.New(string(payloadMsg))
}
if payloadSize == 0 {
return AsrResponse{}, errors.New("payload size if 0")
}
if messageCompression == byte(GZIP) {
payloadMsg = GzipDecompress(payloadMsg)
}
var asrResponse = AsrResponse{}
if serializationMethod == byte(JSON) {
err := json.Unmarshal(payloadMsg, &asrResponse)
if err != nil {
fmt.Println("fail to unmarshal response, ", err.Error())
return AsrResponse{}, err
}
}
return asrResponse, nil
}
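Before wiring everything through HTTP, it can help to test the ASR client on its own with a locally recorded file. A minimal standalone sketch is below; the file path and credentials are placeholders, and the format is set to mp3 to match what the front end records.
package main

import (
	"fmt"
	"os"

	"go-code-x/utils"
)

func main() {
	// Placeholder path to a locally recorded test file
	audio, err := os.ReadFile("test.mp3")
	if err != nil {
		panic(err)
	}
	client := utils.BuildAsrClient()
	client.Appid = "your-appid" // placeholder credentials
	client.Token = "your-token"
	client.Cluster = "your-cluster"
	client.Format = "mp3" // match the MP3 produced by recorder-core
	resp, err := client.RequestAsr(audio)
	if err != nil {
		panic(err)
	}
	if len(resp.Results) > 0 {
		fmt.Println("recognized text:", resp.Results[0].Text)
	}
}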
Summary
Uploading audio from the front end directly to the Volcano Engine ASR API failed, and the WebSocket request also ran into problems. Since my front-end experience is limited, I ultimately chose to send the audio data to the backend Go service first and let the backend call the ASR API. Although the flow is a bit roundabout, the speed is acceptable and the impact on user experience is small. A front-end direct-upload approach could be explored later to simplify the flow.