H5、uniapp、VUE+TS使用火山引擎-流式语音识别,进行语音转文字

H5、uniapp使用火山引擎-流式语音识别,进行语音转文字。后端使用go语言:我在前端vue中录制音频,然后将音频数据传递给go,go再直接上传火山引擎并返回识别的文字内容。

火山引擎:https://www.volcengine.com/docs/6561/80818#_3-3-%E9%94%99%E8%AF%AF%E7%A0%81

前端实现:

在前端,使用recorder-core插件来实现录制MP3文件。

npm install recorder-core

完整代码如下:

<template>
  <!-- Chat footer: mode 1 shows a text input + send button, mode 2 shows a
       push-to-talk voice button (touch events for mobile, mouse for H5). -->
  <view class="ar-footer">
    <slot>
      <view class="ar-footer-button">
        <!-- Icon toggles between keyboard (text) and microphone (voice) input -->
        <image class="ar-footer-img" :src="keyboardPng" v-if="mode === 1" @click="setMode(2)" />
        <image class="ar-footer-img" :src="voicePng" v-else @click="setMode(1)" />
      </view>
      <view class="ar-footer-wrapper">
        <view class="ar-footer-text" v-if="mode === 1">
          <input type="text" class="ar-footer-input" v-model="text" placeholder="输入文字..." @keydown="handleKeydown" />
          <view class="ar-footer-send" @click="send">发送</view>
        </view>
        <!-- Hold to record; releasing stops recording and uploads for ASR -->
        <button class="ar-footer-voice" v-else @touchstart="startVoiceRecord" @touchend="endVoiceRecord"
          @mousedown="startVoiceRecord" @mouseup="endVoiceRecord">按住说话</button>
      </view>
    </slot>
  </view>
</template>

<script setup lang="ts">
import { ref } from 'vue'
import keyboardPng from '../../../static/ai-images/keyboard.png'
import voicePng from '../../../static/ai-images/voice.png'
import { getPartnerList } from '@/api/ars_api';

import Recorder from 'recorder-core'
import 'recorder-core/src/engine/mp3'
import 'recorder-core/src/engine/mp3-engine'
import 'recorder-core/src/extensions/waveview'

// Input mode: 1 = text input, 2 = voice (push-to-talk); drives template v-if.
const mode = ref(1)
// Current contents of the text input; cleared after each send.
const text = ref('')

// onSend(text): parent callback invoked with either the typed message or
// the ASR transcript of a finished voice recording.
const props = defineProps({
  onSend: {
    type: Function,
    required: true
  }
})

// 处理键盘事件
const handleKeydown = (event: KeyboardEvent) => {
  if (event.key === 'Enter') {
    send()
  }
}

// Switch between text (1) and voice (2) input modes. Entering voice mode
// opens the recorder; leaving it releases the recorder.
const setMode = (val: number) => {
  if (val === 2) {
    recOpen();
  } else if (rec) {
    // Guard: rec stays null until recOpen() succeeds (and is reset to null
    // after a stop error), so an unconditional rec.close() could throw.
    rec.close();
    rec = null;
  }
  mode.value = val
}

// Forward the typed message to the parent, then reset the input box.
const send = () => {
  const message = text.value
  props.onSend(message)
  text.value = ''
}

// Push-to-talk state.
// Recorder instance (recorder-core); null/undefined until recOpen() succeeds.
let rec: any;
// Waveform drawing object; never assigned a DOM element in this component,
// so waveform rendering is effectively disabled — TODO confirm intent.
let wave: any;

// Begin recording when the user presses the talk button.
// Logs an error and does nothing if the recorder has not been opened.
const startVoiceRecord = async () => {
  if (rec) {
    rec.start();
    console.log("已开始录音");
  } else {
    console.error("未打开录音");
  }
};

// Stop recording, upload the audio for speech recognition, and forward the
// recognized text to the parent via onSend.
const endVoiceRecord = () => {
  if (!rec) {
    console.error("未打开录音");
    return
  }
  rec.stop(async (blob: Blob, duration: number) => {
    try {
      const result = await getPartnerList(blob);
      // Backend wraps the Volcano ASR response; result[0].text is the transcript.
      props.onSend(result.data.result[0].text)
      text.value = ''
    } catch (err) {
      // getPartnerList rejects on network/API failure; without this catch the
      // rejection inside the stop callback would be unhandled.
      console.error("语音识别失败:", err);
    }
  }, (err: any) => {
    console.error("结束录音出错:" + err);
    rec.close();
    rec = null;
  });
};

// Create and open a recorder-core instance (MP3, 16 kHz, 16 kbps) and request
// microphone permission. On success `rec` becomes usable by
// startVoiceRecord/endVoiceRecord.
const recOpen = async () => {
  try {
    rec = Recorder({
      type: "mp3",
      sampleRate: 16000,
      bitRate: 16,
      onProcess: (buffers: any, powerLevel: any, bufferDuration: any, bufferSampleRate: any, newBufferIdx: any, asyncEnd: any) => {
        // Real-time hook: can draw the waveform or stream audio chunks here.
        if (wave) wave.input(buffers[buffers.length - 1], powerLevel, bufferSampleRate);
      }
    });

    // Open the recorder (triggers the browser/uniapp permission prompt).
    rec.open(() => {
      console.log("录音已打开");
      // NOTE(review): `wave` is never assigned a DOM element before this
      // check, so this branch never runs and no waveform is ever drawn.
      // recorder-core's WaveView expects `elem` to be a container element —
      // confirm whether a template ref was meant to be wired up here.
      if (wave) {
        // Create the audio visualization drawing object.
        wave = Recorder.WaveView({ elem: wave });
      }
    }, (msg: string, isUserNotAllow: boolean) => {
      console.log((isUserNotAllow ? "UserNotAllow," : "") + "无法录音:" + msg);
    });
  } catch (error) {
    console.error('无法获取麦克风权限:', error);
  }
}
</script>

ars_api上传接口:

//ars_api
import { base_url } from '@/config';
/**
 * Upload a recorded audio blob to the backend `/ars/getAsr` endpoint and
 * return the parsed JSON body on success.
 *
 * @param audioBlob recorded audio (MP3) produced by recorder-core
 * @returns the backend response map (code === 0), whose `data` holds the
 *          Volcano ASR result
 * @throws rejects with the backend `msg` when code !== 0, or with the
 *         underlying error on network failure
 */
export async function getPartnerList(audioBlob: Blob) {
  const formData = new FormData();
  formData.append('file', audioBlob, 'recording.mp3');
  const token = uni.getStorageSync('token');
  try {
    const response = await fetch(base_url + '/ars/getAsr', {
      method: 'POST',
      headers: {
        Token: `Bearer ${token}`,
      },
      body: formData,
    });

    if (!response.ok) {
      throw new Error('网络请求失败');
    }

    const map = await response.json();
    // Backend convention: code === 0 means success; otherwise msg explains.
    if (map.code !== 0) {
      return Promise.reject(map.msg);
    }
    return map;
  } catch (err) {
    console.log('请求失败', err);
    return Promise.reject(err);
  }
}

后端实现:

API文件:

// api
package ars

import (
	"context"
	"fmt"
	"go-code-x/dal/model/response"
	"go-code-x/global"
	"go-code-x/utils"
	"io"

	"github.com/cloudwego/hertz/pkg/app"
)

// AsrApi groups the HTTP handlers for the speech-recognition endpoints.
type AsrApi struct {
}

// GetAsr accepts an uploaded audio file (multipart "file" field), forwards it
// to the Volcano Engine streaming ASR service, and writes the recognition
// result back to the client.
func (a *AsrApi) GetAsr(c context.Context, ctx *app.RequestContext) {
	header, err := ctx.Request.FormFile("file")
	if err != nil {
		response.FailWithMessage("接收文件失败", ctx)
		return
	}

	// Configure the ASR client from global config; Format overrides the
	// "wav" default set by BuildAsrClient (e.g. with "mp3").
	client := utils.BuildAsrClient()
	client.Appid = global.CONFIG.Volcanoasr.Appid
	client.Token = global.CONFIG.Volcanoasr.Token
	client.Cluster = global.CONFIG.Volcanoasr.Cluster
	client.Format = global.CONFIG.Volcanoasr.AudioFormat

	file, err := header.Open()
	if err != nil {
		response.FailWithMessage("打开文件失败", ctx)
		return
	}
	defer file.Close()

	// Read the whole upload into memory; RequestAsr segments it itself.
	audioData, err := io.ReadAll(file)
	if err != nil {
		fmt.Println("fail to read audio file", err.Error())
		// Fix: previously returned without writing any HTTP response,
		// leaving the client hanging with an empty body.
		response.FailWithMessage("读取文件失败", ctx)
		return
	}

	asrResponse, err := client.RequestAsr(audioData)
	if err != nil {
		fmt.Println("fail to request asr, ", err.Error())
		// Fix: same as above — the client must always receive a response.
		response.FailWithMessage("语音识别失败", ctx)
		return
	}

	response.OkWithData(asrResponse, ctx)
}

调用火山引擎,实现流式语音识别,可以直接使用官方的Demo中的go文件:

package utils

import (
	"bytes"
	"compress/gzip"
	"encoding/binary"
	"encoding/json"
	"errors"
	"fmt"
	"io/ioutil"
	"net/http"

	"github.com/gorilla/websocket"
	uuid "github.com/satori/go.uuid"
)

// Field types for the 4-byte binary header used by the Volcano ASR
// websocket protocol (one nibble per field; bit widths listed below).
type ProtocolVersion byte
type MessageType byte
type MessageTypeSpecificFlags byte
type SerializationType byte
type CompressionType byte

const (
	// SuccessCode is the server-side "ok" value of AsrResponse.Code.
	SuccessCode = 1000

	PROTOCOL_VERSION    = ProtocolVersion(0b0001)
	DEFAULT_HEADER_SIZE = 0b0001

	// Bit widths of the individual header fields.
	PROTOCOL_VERSION_BITS            = 4
	HEADER_BITS                      = 4
	MESSAGE_TYPE_BITS                = 4
	MESSAGE_TYPE_SPECIFIC_FLAGS_BITS = 4
	MESSAGE_SERIALIZATION_BITS       = 4
	MESSAGE_COMPRESSION_BITS         = 4
	RESERVED_BITS                    = 8

	// Message Type:
	CLIENT_FULL_REQUEST       = MessageType(0b0001)
	CLIENT_AUDIO_ONLY_REQUEST = MessageType(0b0010)
	SERVER_FULL_RESPONSE      = MessageType(0b1001)
	SERVER_ACK                = MessageType(0b1011)
	SERVER_ERROR_RESPONSE     = MessageType(0b1111)

	// Message Type Specific Flags
	NO_SEQUENCE    = MessageTypeSpecificFlags(0b0000) // no check sequence
	POS_SEQUENCE   = MessageTypeSpecificFlags(0b0001)
	NEG_SEQUENCE   = MessageTypeSpecificFlags(0b0010)
	NEG_SEQUENCE_1 = MessageTypeSpecificFlags(0b0011)

	// Message Serialization
	NO_SERIALIZATION = SerializationType(0b0000)
	JSON             = SerializationType(0b0001)
	THRIFT           = SerializationType(0b0011)
	CUSTOM_TYPE      = SerializationType(0b1111)

	// Message Compression
	NO_COMPRESSION     = CompressionType(0b0000)
	GZIP               = CompressionType(0b0001)
	CUSTOM_COMPRESSION = CompressionType(0b1111)
)

// Pre-built 4-byte headers for the three client frame kinds, laid out as:
// version: b0001 (4 bits)
// header size: b0001 (4 bits)
// message type: b0001 (Full client request) (4bits)
// message type specific flags: b0000 (none) (4bits)
// message serialization method: b0001 (JSON) (4 bits)
// message compression: b0001 (gzip) (4bits)
// reserved data: 0x00 (1 byte)
var DefaultFullClientWsHeader = []byte{0x11, 0x10, 0x11, 0x00}
var DefaultAudioOnlyWsHeader = []byte{0x11, 0x20, 0x11, 0x00}
var DefaultLastAudioWsHeader = []byte{0x11, 0x22, 0x11, 0x00}

func GzipCompress(input []byte) []byte {
	var b bytes.Buffer
	w := gzip.NewWriter(&b)
	w.Write(input)
	w.Close()
	return b.Bytes()
}

func GzipDecompress(input []byte) []byte {
	b := bytes.NewBuffer(input)
	r, _ := gzip.NewReader(b)
	out, _ := ioutil.ReadAll(r)
	r.Close()
	return out
}

// AsrResponse is the JSON payload of a server full-response frame.
type AsrResponse struct {
	Reqid    string   `json:"reqid"`
	Code     int      `json:"code"` // SuccessCode (1000) on success
	Message  string   `json:"message"`
	Sequence int      `json:"sequence"`
	Results  []Result `json:"result,omitempty"`
}

// Result is one recognition hypothesis (nbest entry).
type Result struct {
	// required
	Text       string `json:"text"`
	Confidence int    `json:"confidence"`
	// if show_language == true
	Language string `json:"language,omitempty"`
	// if show_utterances == true
	Utterances []Utterance `json:"utterances,omitempty"`
}

// Utterance is a time-aligned sentence-level segment of a Result.
type Utterance struct {
	Text      string `json:"text"`
	StartTime int    `json:"start_time"`
	EndTime   int    `json:"end_time"`
	Definite  bool   `json:"definite"`
	Words     []Word `json:"words"`
	// if show_language = true
	Language string `json:"language"`
}

// Word is a time-aligned word within an Utterance.
type Word struct {
	Text      string `json:"text"`
	StartTime int    `json:"start_time"`
	EndTime   int    `json:"end_time"`
	Pronounce string `json:"pronounce"`
	// in docs example - blank_time
	BlankDuration int `json:"blank_duration"`
}

// WsHeader models the decoded 4-byte frame header. Not referenced by the
// code shown in this file; parseResponse reads the raw bytes directly.
type WsHeader struct {
	ProtocolVersion          ProtocolVersion
	DefaultHeaderSize        int
	MessageType              MessageType
	MessageTypeSpecificFlags MessageTypeSpecificFlags
	SerializationType        SerializationType
	CompressionType          CompressionType
}

// RequestAsr is the interface form of the request entry point.
// Not referenced by the code shown in this file.
type RequestAsr interface {
	requestAsr(audio_data []byte)
}

// AsrClient holds credentials and audio parameters for one ASR session.
type AsrClient struct {
	Appid    string
	Token    string
	Cluster  string
	Workflow string
	Format   string // audio container format, e.g. "wav" or "mp3"
	Codec    string
	SegSize  int // bytes of audio sent per websocket frame
	Url      string
}

// BuildAsrClient returns an AsrClient preloaded with the demo defaults:
// the standard recognition workflow, 5-second segments at a 16 kHz sample
// rate, wav format and raw codec. Callers overwrite Appid/Token/Cluster
// (and usually Format) before use.
func BuildAsrClient() AsrClient {
	return AsrClient{
		Workflow: "audio_in,resample,partition,vad,fe,decode",
		SegSize:  160000, // 5s in 16000 sample rate
		Format:   "wav",  // default wav audio
		Codec:    "raw",  // default raw codec
	}
}

// RequestAsr performs one full recognition round-trip over websocket:
// it dials the Volcano ASR endpoint, sends the JSON "full client request",
// then streams the audio in SegSize-byte chunks (the last chunk flagged via
// DefaultLastAudioWsHeader) and returns the final parsed server response.
func (client *AsrClient) RequestAsr(audioData []byte) (AsrResponse, error) {
	// set token header
	var tokenHeader = http.Header{"Authorization": []string{fmt.Sprintf("Bearer;%s", client.Token)}}
	c, _, err := websocket.DefaultDialer.Dial("wss://openspeech.bytedance.com/api/v2/asr", tokenHeader)
	if err != nil {
		fmt.Println(err)
		return AsrResponse{}, err
	}
	defer c.Close()

	// 1. send full client request
	req := client.constructRequest()
	payload := GzipCompress(req)
	payloadSize := len(payload)
	payloadSizeArr := make([]byte, 4)
	binary.BigEndian.PutUint32(payloadSizeArr, uint32(payloadSize))

	// Frame layout: 4-byte header + 4-byte big-endian payload size + gzip payload.
	fullClientMsg := make([]byte, len(DefaultFullClientWsHeader))
	copy(fullClientMsg, DefaultFullClientWsHeader)
	fullClientMsg = append(fullClientMsg, payloadSizeArr...)
	fullClientMsg = append(fullClientMsg, payload...)
	// NOTE(review): WriteMessage errors are ignored here and in the loop
	// below; a broken connection only surfaces via the next ReadMessage.
	c.WriteMessage(websocket.BinaryMessage, fullClientMsg)
	_, msg, err := c.ReadMessage()
	if err != nil {
		fmt.Println("fail to read message fail, err:", err.Error())
		return AsrResponse{}, err
	}
	asrResponse, err := client.parseResponse(msg)
	if err != nil {
		fmt.Println("fail to parse response ", err.Error())
		return AsrResponse{}, err
	}

	// 3. send segment audio request
	// Each SegSize-byte slice is gzip-compressed and sent as an audio-only
	// frame; the server replies to every frame, and the reply to the last
	// frame carries the final recognition result.
	for sentSize := 0; sentSize < len(audioData); sentSize += client.SegSize {
		lastAudio := false
		if sentSize+client.SegSize >= len(audioData) {
			lastAudio = true
		}
		dataSlice := make([]byte, 0)
		audioMsg := make([]byte, len(DefaultAudioOnlyWsHeader))
		if !lastAudio {
			dataSlice = audioData[sentSize : sentSize+client.SegSize]
			copy(audioMsg, DefaultAudioOnlyWsHeader)
		} else {
			dataSlice = audioData[sentSize:]
			copy(audioMsg, DefaultLastAudioWsHeader)
		}
		payload = GzipCompress(dataSlice)
		payloadSize := len(payload)
		payloadSizeArr := make([]byte, 4)
		binary.BigEndian.PutUint32(payloadSizeArr, uint32(payloadSize))
		audioMsg = append(audioMsg, payloadSizeArr...)
		audioMsg = append(audioMsg, payload...)
		c.WriteMessage(websocket.BinaryMessage, audioMsg)
		_, msg, err := c.ReadMessage()
		if err != nil {
			fmt.Println("fail to read message fail, err:", err.Error())
			return AsrResponse{}, err
		}
		asrResponse, err = client.parseResponse(msg)
		if err != nil {
			fmt.Println("fail to parse response ", err.Error())
			return AsrResponse{}, err
		}
	}
	return asrResponse, nil
}

// constructRequest builds the JSON body of the initial "full client
// request": app credentials, a fixed user id, request options and the
// audio format/codec. A fresh UUID is generated as reqid on every call.
// (encoding/json marshals map keys in sorted order, so output is stable.)
func (client *AsrClient) constructRequest() []byte {
	req := map[string]map[string]interface{}{
		"app": {
			"appid":   client.Appid,
			"cluster": client.Cluster,
			"token":   client.Token,
		},
		"user": {
			"uid": "uid",
		},
		"request": {
			"reqid":       uuid.NewV4().String(),
			"nbest":       1,
			"workflow":    client.Workflow,
			"result_type": "full",
			"sequence":    1,
		},
		"audio": {
			"format": client.Format,
			"codec":  client.Codec,
		},
	}
	reqStr, _ := json.Marshal(req)
	return reqStr
}

// parseResponse decodes one binary websocket frame from the ASR server.
// Frame layout: 4-byte header (version/size, type/flags, serialization/
// compression, reserved) followed by a type-specific payload. It returns
// the decoded AsrResponse for full responses, logs ACK sequence numbers,
// and returns an error for server error frames or empty payloads.
// (Fix: corrected the "payload size is 0" and "SERVER_ERROR_RESPONSE"
// message typos; logic is otherwise unchanged.)
func (client *AsrClient) parseResponse(msg []byte) (AsrResponse, error) {
	//protocol_version := msg[0] >> 4
	headerSize := msg[0] & 0x0f
	messageType := msg[1] >> 4
	//message_type_specific_flags := msg[1] & 0x0f
	serializationMethod := msg[2] >> 4
	messageCompression := msg[2] & 0x0f
	//reserved := msg[3]
	//header_extensions := msg[4:header_size * 4]
	// Header size is expressed in 4-byte units; payload starts right after.
	payload := msg[headerSize*4:]
	payloadMsg := make([]byte, 0)
	payloadSize := 0

	if messageType == byte(SERVER_FULL_RESPONSE) {
		// 4-byte payload size, then the (possibly gzipped) JSON body.
		payloadSize = int(int32(binary.BigEndian.Uint32(payload[0:4])))
		payloadMsg = payload[4:]
	} else if messageType == byte(SERVER_ACK) {
		// 4-byte sequence number, optionally followed by size + body.
		seq := int32(binary.BigEndian.Uint32(payload[:4]))
		if len(payload) >= 8 {
			payloadSize = int(binary.BigEndian.Uint32(payload[4:8]))
			payloadMsg = payload[8:]
		}
		fmt.Println("SERVER_ACK seq: ", seq)
	} else if messageType == byte(SERVER_ERROR_RESPONSE) {
		// 4-byte error code, 4-byte size, then the error message body.
		code := int32(binary.BigEndian.Uint32(payload[:4]))
		payloadSize = int(binary.BigEndian.Uint32(payload[4:8]))
		payloadMsg = payload[8:]
		fmt.Println("SERVER_ERROR_RESPONSE code: ", code)
		return AsrResponse{}, errors.New(string(payloadMsg))
	}
	if payloadSize == 0 {
		return AsrResponse{}, errors.New("payload size is 0")
	}
	if messageCompression == byte(GZIP) {
		payloadMsg = GzipDecompress(payloadMsg)
	}

	var asrResponse = AsrResponse{}
	if serializationMethod == byte(JSON) {
		err := json.Unmarshal(payloadMsg, &asrResponse)
		if err != nil {
			fmt.Println("fail to unmarshal response, ", err.Error())
			return AsrResponse{}, err
		}
	}
	return asrResponse, nil
}

总结

前端直接上传音频到火山引擎 ASR 接口失败,WebSocket 请求也遇到问题。由于前端经验有限,最终选择先将音频数据传给后端 Go 服务,再由后端调用 ASR 接口。尽管流程较绕,但速度尚可,用户体验影响不大。未来可进一步研究前端直传方案,以简化流程。

滚动至顶部