feat: realistic face avatar + voice input + ASR endpoint

2026-06-12 15:32:04 +08:00
parent 6fe84b6ef8
commit 8191cf4b41
26 changed files with 1934 additions and 228 deletions
@@ -1,14 +1,18 @@
-import { Controller, Get, Post, Body, Param, Res, HttpException, HttpStatus } from '@nestjs/common'
+import { Controller, Get, Post, Body, Param, Res, HttpException, HttpStatus, Logger, UseGuards, UploadedFile, UseInterceptors } from '@nestjs/common'
+import { FileInterceptor } from '@nestjs/platform-express'
 import { Response } from 'express'
 import * as fs from 'fs'
+import * as path from 'path'
+import { execFile, execSync } from 'child_process'
+import { promisify } from 'util'
 import { TtsService } from './tts.service'
+import { JwtAuthGuard } from '../../common/guards/jwt-auth.guard'
 import { Public } from '../../common/decorators/public.decorator'

@Controller('tts')
 export class TtsController {
  constructor(private ttsService: TtsService) {}

-  @Public()
+  @UseGuards(JwtAuthGuard)
  @Post('synthesize')
  async synthesize(@Body('text') text: string, @Body('voice') voice?: string) {
    if (!text || text.length > 500) {
@@ -30,4 +34,36 @@ export class TtsController {
    res.setHeader('Cache-Control', 'public, max-age=31536000')
    stream.pipe(res)
  }
+
+  /** Logger used by the ASR endpoint to report backend failures. */
+  private readonly asrLogger = new Logger(TtsController.name)
+
+  /**
+   * Speech-to-text endpoint: accepts one multipart upload in the `audio`
+   * field and answers `{ text }` with the recognised transcript.
+   *
+   * Backends are tried in order: the OpenAI transcription API (only when
+   * `OPENAI_API_KEY` is set), then a local `whisper` CLI. A failing backend
+   * is logged and the next one is tried. When no backend yields text the
+   * response is `{ text: '' }`.
+   *
+   * @param file upload written to disk by the `FileInterceptor`
+   * @returns the trimmed transcript, or an empty string when nothing was recognised
+   * @throws HttpException 400 when the request carries no `audio` file
+   */
+  @UseGuards(JwtAuthGuard)
+  @Post('asr')
+  @UseInterceptors(FileInterceptor('audio', { dest: '/tmp/asr_uploads' }))
+  async recognize(
+    @UploadedFile() file: { path: string; filename: string; originalname: string } | undefined,
+  ): Promise<{ text: string }> {
+    if (!file) throw new HttpException('请上传音频文件', HttpStatus.BAD_REQUEST)
+
+    const uploadDir = '/tmp/asr_uploads'
+    // `originalname` is client-controlled. Its extension is kept only when it
+    // is a short alphanumeric suffix, so it can neither smuggle curl `-F`
+    // modifiers (`;type=`, `,`) nor any other syntax into the commands below.
+    const requestedExt = path.extname(file.originalname)
+    const ext = /^\.[A-Za-z0-9]{1,8}$/.test(requestedExt) ? requestedExt : '.mp3'
+    // The upload is renamed to carry an extension before it is handed to the backends.
+    const dest = path.join(uploadDir, file.filename + ext)
+
+    try {
+      await fs.promises.mkdir(uploadDir, { recursive: true })
+      await fs.promises.rename(file.path, dest)
+
+      const apiKey = process.env.OPENAI_API_KEY
+      if (apiKey) {
+        try {
+          const text = await this.transcribeWithOpenAi(dest, apiKey)
+          if (text) return { text }
+        } catch (err) {
+          this.asrLogger.warn(`OpenAI transcription failed, trying local whisper: ${this.describeFailure(err)}`)
+        }
+      }
+
+      try {
+        const text = await this.transcribeWithLocalWhisper(dest)
+        if (text) return { text }
+      } catch (err) {
+        this.asrLogger.warn(`Local whisper transcription failed: ${this.describeFailure(err)}`)
+      }
+
+      return { text: '' }
+    } finally {
+      // Best-effort cleanup so uploads do not pile up in the temp directory.
+      // Only one of the two paths exists at any time, hence the ignored errors.
+      await Promise.all(
+        [file.path, dest].map((p) => fs.promises.unlink(p).catch(() => undefined)),
+      )
+    }
+  }
+
+  /** Sends the audio file to the OpenAI transcription API via curl and returns the trimmed text ('' if absent). */
+  private async transcribeWithOpenAi(audioPath: string, apiKey: string): Promise<string> {
+    // `-F` already makes curl send multipart/form-data, so no explicit Content-Type header is passed.
+    const stdout = await this.runCommand(
+      'curl',
+      [
+        '-s', '-X', 'POST', 'https://api.openai.com/v1/audio/transcriptions',
+        '-H', `Authorization: Bearer ${apiKey}`,
+        '-F', `file=@${audioPath}`,
+        '-F', 'model=whisper-1',
+        '-F', 'language=zh',
+      ],
+      30000,
+    )
+    const parsed: unknown = JSON.parse(stdout)
+    const text = (parsed as { text?: unknown } | null)?.text
+    return typeof text === 'string' ? text.trim() : ''
+  }
+
+  /** Runs the local `whisper` CLI on the audio file and returns its trimmed stdout. */
+  private async transcribeWithLocalWhisper(audioPath: string): Promise<string> {
+    const stdout = await this.runCommand(
+      'whisper',
+      [audioPath, '--language', 'zh', '--output_format', 'txt'],
+      60000,
+    )
+    return stdout.trim()
+  }
+
+  /** Runs a program without a shell (arguments are never re-parsed) and resolves with its stdout; stderr is discarded. */
+  private async runCommand(command: string, args: string[], timeoutMs: number): Promise<string> {
+    const { stdout } = await promisify(execFile)(command, args, {
+      encoding: 'utf8',
+      timeout: timeoutMs,
+      maxBuffer: 16 * 1024 * 1024,
+    })
+    return stdout
+  }
+
+  /** Summarises an error for logging without its message, which for child-process failures embeds the full command line (including the API key). */
+  private describeFailure(err: unknown): string {
+    if (!(err instanceof Error)) return 'unknown error'
+    const { code, signal } = err as Error & { code?: unknown; signal?: unknown }
+    return `${err.name} (code=${String(code)}, signal=${String(signal)})`
+  }
 }
@@ -12,6 +12,18 @@ interface TtsResult {
  durationMs: number
 }

+// Voice identifiers accepted by validateVoice; any other value is rejected.
+const VALID_VOICES = new Set([
+  'zh-CN-XiaoxiaoNeural',
+  'zh-CN-XiaoyiNeural',
+  'zh-CN-YunjianNeural',
+  'zh-CN-YunxiNeural',
+  'zh-CN-YunxiaNeural',
+  'zh-CN-YunyangNeural',
+  'zh-CN-liaoning-XiaobeiNeural',
+  'zh-CN-shaanxi-XiaoniNeural',
+])
+
+/**
+ * Guard for a requested voice: passes silently when the identifier is in
+ * VALID_VOICES and throws otherwise.
+ *
+ * @param voice voice identifier requested by the caller
+ * @throws Error when the identifier is not in the whitelist
+ */
+function validateVoice(voice: string): void {
+  const supported = VALID_VOICES.has(voice)
+  if (supported) return
+  throw new Error(`不支持的语音: ${voice}`)
+}
+
@Injectable()
 export class TtsService {
  private readonly logger = new Logger(TtsService.name)
@@ -23,6 +35,7 @@ export class TtsService {
  }

  async synthesize(text: string, voice: string = 'zh-CN-XiaoxiaoNeural'): Promise<TtsResult> {
+    validateVoice(voice)
    const hash = crypto.createHash('md5').update(text + voice).digest('hex')
    const filePath = path.join(CACHE_DIR, `${hash}.mp3`)