Practical AI Large Models · AI Digital Human Application, Part 4: Rebuilding the Digital Human Application with Multiple Large Models (Code Edition)
Author: Binghe
Knowledge Planet: http://m6z.cn/6aeFbs
Blog: https://binghe.site
Article index: https://binghe.site/md/all/all.html
Source code: https://t.zsxq.com/0dhvFs5oR
Hello everyone, I'm Binghe~~
Today I'll walk you through rebuilding the digital human application on top of several models working together: ChatTTS + whisper-tiny + Qwen3-8B. Let's get into today's topic.
1. Environment Setup and Project Initialization
1.1 Hardware and System Requirements
- Operating system: Ubuntu 22.04 LTS (recommended) or Windows with WSL2
- GPU: NVIDIA RTX 4090/3090 (24 GB VRAM) or RTX 4080 (16 GB VRAM)
- VRAM: at least 16 GB; Qwen3-8B needs about 10 GB, the other models roughly 2-3 GB each (a quick check script follows this list)
- Storage: at least 50 GB of free space
- Memory: 32 GB RAM
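Before going further, it is worth confirming that the GPU actually exposes enough VRAM. Here is a minimal sketch using PyTorch, runnable once PyTorch is installed in section 1.2; the file name check_gpu.py and the hard-coded 16 GB threshold are my own, taken from the requirements above:
# check_gpu.py -- quick sanity check for GPU availability and VRAM (illustrative)
import torch

if not torch.cuda.is_available():
    raise SystemExit("No CUDA device visible; the pipeline would fall back to CPU")

props = torch.cuda.get_device_properties(0)
total_gb = props.total_memory / 1024**3
print(f"GPU: {props.name}, VRAM: {total_gb:.1f} GB")

# 16 GB is the stated minimum: ~10 GB for Qwen3-8B plus 2-3 GB each for the other models
if total_gb < 16:
    print("Warning: less than 16 GB of VRAM; consider a smaller LLM or CPU offloading")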
1.2 Creating the Project Directory Structure
# Create the project root directory
mkdir ~/ai-digital-human && cd ~/ai-digital-human
# Create the full directory tree
mkdir -p {models,src/{asr,tts,llm,api},data/{audio_input,audio_output},logs,configs,docker}
# Create and activate a virtual environment
python3.10 -m venv venv
source venv/bin/activate
# Create requirements.txt
cat > requirements.txt << 'EOF'
torch>=2.0.0
torchaudio>=2.0.0
transformers>=4.36.0
openai-whisper>=20231117
fastapi>=0.104.0
uvicorn[standard]>=0.24.0
pydub>=0.25.1
soundfile>=0.12.1
numpy>=1.24.0
scipy>=1.11.0
pydantic>=2.5.0
websockets>=12.0
python-multipart>=0.0.6
sentencepiece>=0.1.99
accelerate>=0.24.0
einops>=0.7.0
safetensors>=0.4.0
gradio>=4.0.0
langchain>=0.0.350
EOF
# Install PyTorch (pick the index URL that matches your CUDA version)
pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121
# Install the remaining dependencies
pip install -r requirements.txt
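Once the dependencies are in place, a quick import test catches CUDA or version mismatches before any model work starts. A minimal sketch (the file name verify_env.py is my own):
# verify_env.py -- confirm the core libraries import and CUDA is usable (illustrative)
import torch
import transformers
import whisper

print(f"PyTorch {torch.__version__}, CUDA available: {torch.cuda.is_available()}")
print(f"Transformers {transformers.__version__}")

# whisper-tiny is small (~39M parameters), so loading it is a cheap end-to-end check
model = whisper.load_model("tiny", device="cuda" if torch.cuda.is_available() else "cpu")
print("whisper-tiny loaded OK")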
1.3 Configuring the System Environment
# Install system dependencies
sudo apt update && sudo apt install -y \
ffmpeg \
libsndfile1 \
portaudio19-dev \
python3-dev \
build-essential \
git-lfs \
nvidia-cuda-toolkit \
nvidia-driver-535
# Configure Git LFS (used for downloading large files)
git lfs install
# Create the model download script
cat > download_models.sh << 'EOF'
#!/bin/bash
MODEL_DIR="models"
# Create model subdirectories
mkdir -p $MODEL_DIR/{whisper,chattts,qwen}
echo "开始下载模型文件..."
# 1. whisper-tiny (downloaded automatically by openai-whisper on first load)
echo "Preparing the whisper-tiny model..."
# 2. ChatTTS
echo "Cloning the ChatTTS repository..."
cd $MODEL_DIR/chattts
git clone https://github.com/2noise/ChatTTS.git
cd ChatTTS
pip install -r requirements.txt
# 3. Qwen3-8B (from Hugging Face)
echo "Downloading the Qwen3-8B model..."
# Back out of chattts/ChatTTS into models/qwen
cd ../../qwen
git clone https://huggingface.co/Qwen/Qwen3-8B
echo "Model downloads complete!"
EOF
chmod +x download_models.sh
./download_models.sh
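If the git clone of the Hugging Face repository is slow or keeps getting interrupted, huggingface_hub (pulled in as a dependency of transformers) supports resumable downloads. A hedged alternative sketch, not part of the original script; the file name is my own:
# download_qwen.py -- resumable alternative to git clone (illustrative)
from huggingface_hub import snapshot_download

# Fetches every file in the repo into models/qwen/Qwen3-8B, resuming if interrupted
snapshot_download(
    repo_id="Qwen/Qwen3-8B",
    local_dir="models/qwen/Qwen3-8B",
)
print("Qwen3-8B download complete")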
2. Core Model Deployment
2.1 Deploying Whisper-tiny for Speech Recognition
# src/asr/whisper_asr.py
import logging
from typing import Dict, Optional

import numpy as np
import torch
import whisper


class WhisperASR:
    def __init__(self, model_size: str = "tiny", device: str = "cuda"):
        """
        Initialize the Whisper ASR model.

        Args:
            model_size: model size (tiny, base, small, medium, large)
            device: execution device ("cuda" or "cpu")
        """
        self.logger = logging.getLogger(__name__)
        self.device = device if torch.cuda.is_available() and device == "cuda" else "cpu"
        self.logger.info(f"Loading Whisper-{model_size} onto {self.device}...")
        self.model = whisper.load_model(model_size, device=self.device)
        self.logger.info("Whisper model loaded!")
        # Mapping of Chinese language names to Whisper language codes
        self.language_codes = {
            "中文": "zh",
            "英语": "en",
            "日语": "ja",
            "韩语": "ko",
            "法语": "fr",
            "西班牙语": "es",
            "德语": "de",
        }

    def transcribe(
        self,
        audio_path: str,
        language: Optional[str] = None,
        task: str = "transcribe",
        temperature: float = 0.0,
        beam_size: int = 5,
        **kwargs,
    ) -> Dict:
        """
        Transcribe an audio file.

        Args:
            audio_path: path to the audio file
            language: language code (zh, en, ja, ...); auto-detected if None
            task: "transcribe" or "translate"
            temperature: sampling temperature (0.0 enables beam search)
            beam_size: beam search width

        Returns:
            A dict with the transcription result.
        """
        try:
            # Load the audio and pad/trim it to Whisper's 30-second window
            audio = whisper.load_audio(audio_path)
            audio = whisper.pad_or_trim(audio)
            # Compute the log-mel spectrogram; read n_mels from the model,
            # since tiny uses 80 mel bins and only large-v3 uses 128
            mel = whisper.log_mel_spectrogram(
                audio, n_mels=self.model.dims.n_mels
            ).to(self.model.device)
            # Detect the language if none was given
            if language is None:
                _, probs = self.model.detect_language(mel)
                language = max(probs, key=probs.get)
                self.logger.info(
                    f"Detected language: {language}, confidence: {probs[language]:.2f}"
                )
            # Decoding options (extra **kwargs go to transcribe() below,
            # not here, because DecodingOptions rejects unknown fields)
            options = whisper.DecodingOptions(
                language=language,
                task=task,
                temperature=temperature,
                beam_size=beam_size,
                fp16=self.device == "cuda",
            )
            # Decode the (up to) 30-second window
            result = whisper.decode(self.model, mel, options)
            # Full transcription with segment timestamps (handles audio longer than 30s)
            full_result = self.model.transcribe(
                audio_path,
                language=language,
                task=task,
                temperature=temperature,
                **kwargs,
            )
            return {
                "text": result.text,
                "language": language,
                "segments": full_result.get("segments", []),
                # avg_logprob is the mean token log-probability; exp() maps it to (0, 1]
                "confidence": float(np.exp(result.avg_logprob)),
                "success": True,
            }
        except Exception as e:
            self.logger.error(f"Transcription failed: {e}")
            return {
                "text": "",
                "language": language or "unknown",
                "segments": [],
                "confidence": 0.0,
                "success": False,
                "error": str(e),
            }

    def realtime_transcribe(self, audio_chunk: np.ndarray) -> str:
        """
        Transcribe a streaming audio chunk.

        Args:
            audio_chunk: audio sample array

        Returns:
            Transcribed text.
        """
        # Placeholder: extend this with buffering and VAD for real streaming ASR
        raise NotImplementedError("Streaming transcription is not implemented yet")

    def translate(self, audio_path: str, source_lang: Optional[str] = None) -> Dict:
        """
        Translate spoken audio into English. Note that Whisper's translate
        task always targets English; the language option names the *source*
        language of the speech, which is auto-detected if None.

        Args:
            audio_path: path to the audio file
            source_lang: language code of the speech, auto-detected if None

        Returns:
            Translation result.
        """
        return self.transcribe(audio_path, task="translate", language=source_lang)


# Test script
if __name__ == "__main__":
    import sys

    logging.basicConfig(level=logging.INFO)
    asr = WhisperASR(model_size="tiny")
    # Test transcription
    if len(sys.argv) > 1:
        result = asr.transcribe(sys.argv[1], language="zh")
        print(f"Transcription: {result['text']}")
        print(f"Language: {result['language']}")
        print(f"Confidence: {result['confidence']:.4f}")
    else:
        print("Please provide an audio file path")
2.2 Deploying the Qwen3-8B Chat Model
