from pathlib import Path
import sys
import numpy as np
import torch
import soundfile as sf
from IPython.display import Audio, display
# Root of the local CosyVoice checkout. This is a machine-specific absolute
# path; adjust it when running on another machine.
COSYVOICE_ROOT = Path(r"C:\Users\Geodezik\image_generation\CosyVoice_official").resolve()
MATCHA_ROOT = COSYVOICE_ROOT / "third_party" / "Matcha-TTS"

# Put both source trees at the front of sys.path. MATCHA_ROOT is inserted
# first so that COSYVOICE_ROOT ends up at index 0 and MATCHA_ROOT right after
# it. Any stale copies are removed beforehand so that re-running this code
# (e.g. re-executing a notebook cell) does not pile up duplicate entries.
for _root in (MATCHA_ROOT, COSYVOICE_ROOT):
    _entry = str(_root)
    while _entry in sys.path:
        sys.path.remove(_entry)
    sys.path.insert(0, _entry)

# Inputs and output of the test run, all relative to the checkout above
# (OUTPUT_WAV is relative to the current working directory).
MODEL_DIR = COSYVOICE_ROOT / "pretrained_models" / "Fun-CosyVoice3-0.5B"
PROMPT_WAV = COSYVOICE_ROOT / "asset" / "zero_shot_prompt.wav"
OUTPUT_WAV = "official_zh_clean_repo_test.wav"
import torchaudio.functional as AF
def load_wav_no_torchcodec(wav: "str | Path", target_sr: int, min_sr: int = 16000) -> torch.Tensor:
    """Load an audio file as a mono float32 tensor using ``soundfile``.

    The file is decoded with ``sf.read``, down-mixed to a single channel by
    averaging all channels, and resampled to ``target_sr`` with
    ``torchaudio.functional.resample`` when the source rate differs.
    Diagnostic information is printed along the way.

    Args:
        wav: Path of the audio file to read.
        target_sr: Sample rate, in Hz, of the returned tensor.
        min_sr: Lowest accepted source sample rate, in Hz.

    Returns:
        A tensor of shape ``(1, num_samples)`` sampled at ``target_sr``.

    Raises:
        ValueError: If the file contains no samples, or if its sample rate
            is lower than ``min_sr``.
    """
    wav = Path(wav)

    # Print the path diagnostics BEFORE decoding so they are still visible
    # when sf.read() fails on a missing or unreadable file.
    print("\n[load_wav_no_torchcodec]")
    print("file:", wav)
    print("exists:", wav.exists())

    # always_2d=True makes the array (frames, channels) even for mono input.
    audio, sr = sf.read(str(wav), dtype="float32", always_2d=True)
    print("sr:", sr)
    print("shape:", audio.shape)
    print("duration:", audio.shape[0] / sr)

    # Fail with a clear message instead of an opaque reduction error from
    # min()/max() on an empty tensor.
    if audio.shape[0] == 0:
        raise ValueError(f"audio file {wav} contains no samples")

    # (frames, channels) -> (channels, frames), then average down to mono
    # while keeping the channel dimension.
    speech = torch.from_numpy(audio.T).mean(dim=0, keepdim=True)
    print("min/max/std:", speech.min().item(), speech.max().item(), speech.std().item())

    if sr < min_sr:
        raise ValueError(f"sample rate {sr} is lower than min_sr {min_sr}")

    if sr != target_sr:
        speech = AF.resample(speech, sr, target_sr)
        print("resampled to:", target_sr)
        print("resampled shape:", speech.shape)
        print("resampled duration:", speech.shape[1] / target_sr)
        print("resampled min/max/std:", speech.min().item(), speech.max().item(), speech.std().item())

    return speech
import cosyvoice
import cosyvoice.cli.cosyvoice as cosyvoice_cli
import cosyvoice.utils.file_utils as file_utils
import cosyvoice.cli.frontend as frontend
# Replace CosyVoice's load_wav with the soundfile-based loader defined in
# this file. Both modules get the attribute assigned; presumably frontend
# keeps its own reference to load_wav -- confirm against the CosyVoice source.
for _patched in (file_utils, frontend):
    _patched.load_wav = load_wav_no_torchcodec

# ---- Environment report: which modules and files are actually in use ----
_RULE = "=" * 80
print(_RULE)
print("IMPORT CHECK")
print(_RULE)

for _label, _module in (
    ("cosyvoice:", cosyvoice),
    ("cosyvoice cli:", cosyvoice_cli),
    ("frontend:", frontend),
    ("file_utils:", file_utils),
):
    print(_label, _module.__file__)

for _path_label, _exists_label, _path in (
    ("\nMODEL_DIR:", "MODEL_DIR exists:", MODEL_DIR),
    ("\nPROMPT_WAV:", "PROMPT_WAV exists:", PROMPT_WAV),
):
    print(_path_label, _path)
    print(_exists_label, _path.exists())

print("\nTorch:", torch.__version__)
print("CUDA:", torch.cuda.is_available())

# The two probes below are best-effort diagnostics: any failure is reported
# and the script carries on.
try:
    import torchaudio
    print("torchaudio:", torchaudio.__version__)
except Exception as err:
    print("torchaudio error:", repr(err))

try:
    import onnxruntime as ort
    print("onnxruntime:", ort.__version__)
    print("onnxruntime providers:", ort.get_available_providers())
except Exception as err:
    print("onnxruntime error:", repr(err))
from cosyvoice.cli.cosyvoice import AutoModel
# NOTE: from here on the name `cosyvoice` refers to the loaded model object,
# no longer to the imported `cosyvoice` package.
cosyvoice = AutoModel(model_dir=str(MODEL_DIR))
print("\nsample_rate:", cosyvoice.sample_rate)
print("cosyvoice object:", type(cosyvoice))

# Collect every result yielded by the zero-shot call (non-streaming).
outs = [
    *cosyvoice.inference_zero_shot(
        "八百标兵奔北坡,北坡炮兵并排跑,炮兵怕把标兵碰,标兵怕碰炮兵炮。",
        "You are a helpful assistant.<|endofprompt|>希望你以后能够做的比我还好呦。",
        str(PROMPT_WAV),
        stream=False,
        speed=1.0,
    )
]

# Join the "tts_speech" tensors of all results along dim 1
# (assumes each one is shaped (1, num_samples) -- confirm).
wav = torch.cat([chunk["tts_speech"] for chunk in outs], dim=1)

print("\nOUT")
print("shape:", wav.shape)
print("sample_rate:", cosyvoice.sample_rate)
print("duration:", wav.shape[1] / cosyvoice.sample_rate)
print("min/max/std:", wav.min().item(), wav.max().item(), wav.std().item())

# Move to a CPU NumPy array, drop the leading dimension, and clamp the
# samples to [-1.0, 1.0] before writing the file.
audio = np.clip(wav.detach().cpu().squeeze(0).numpy(), -1.0, 1.0)
sf.write(OUTPUT_WAV, audio, cosyvoice.sample_rate)

# Show an inline audio player for the written file.
display(Audio(OUTPUT_WAV))