Загрузка данных


from pathlib import Path
import sys
import numpy as np
import torch
import soundfile as sf
from IPython.display import Audio, display

# Root of the local CosyVoice checkout; every other path below is derived from it.
COSYVOICE_ROOT = Path(r"C:\Users\Geodezik\image_generation\CosyVoice_official").resolve()
MATCHA_ROOT = COSYVOICE_ROOT.joinpath("third_party", "Matcha-TTS")
MODEL_DIR = COSYVOICE_ROOT.joinpath("pretrained_models", "Fun-CosyVoice3-0.5B")
PROMPT_WAV = COSYVOICE_ROOT.joinpath("asset", "zero_shot_prompt.wav")

# Bare file name (no directory component) that the synthesized audio is written to.
OUTPUT_WAV = "official_zh_clean_repo_test.wav"

# Put both directories at the very front of the import search path, with the
# CosyVoice root ahead of the Matcha-TTS directory.
sys.path[:0] = [str(COSYVOICE_ROOT), str(MATCHA_ROOT)]

import torchaudio.functional as AF

def load_wav_no_torchcodec(wav, target_sr, min_sr=16000):
    """Load an audio file as a mono float32 tensor, decoding it with soundfile.

    All channels of the file are averaged into a single channel, and the
    result is resampled to ``target_sr`` when the file's own sample rate
    differs. Diagnostic information is printed along the way.

    Args:
        wav: Path to the audio file (anything accepted by ``pathlib.Path``).
        target_sr: Sample rate, in Hz, of the returned waveform.
        min_sr: Lowest accepted sample rate of the source file.

    Returns:
        A ``torch.Tensor`` of shape ``(1, num_samples)`` at ``target_sr``.

    Raises:
        ValueError: If the file contains no audio frames, or if its sample
            rate is lower than ``min_sr``.
    """
    wav = Path(wav)

    # Report the path before decoding so these diagnostics are still visible
    # when soundfile fails to open the file.
    print("\n[load_wav_no_torchcodec]")
    print("file:", wav)
    print("exists:", wav.exists())

    # always_2d=True yields a (frames, channels) array even for mono files.
    audio, sr = sf.read(str(wav), dtype="float32", always_2d=True)
    num_frames = audio.shape[0]

    print("sr:", sr)
    print("shape:", audio.shape)
    print("duration:", num_frames / sr)

    if num_frames == 0:
        # min()/max() below cannot reduce an empty tensor; fail with a clear
        # message instead of an opaque torch reduction error.
        raise ValueError(f"audio file {wav} contains no samples")

    # (frames, channels) -> (channels, frames), then average channels to mono.
    speech = torch.from_numpy(audio.T)
    speech = speech.mean(dim=0, keepdim=True)
    print("min/max/std:", speech.min().item(), speech.max().item(), speech.std().item())

    if sr < min_sr:
        raise ValueError(f"sample rate {sr} is lower than min_sr {min_sr}")

    if sr != target_sr:
        speech = AF.resample(speech, sr, target_sr)
        print("resampled to:", target_sr)
        print("resampled shape:", speech.shape)
        print("resampled duration:", speech.shape[1] / target_sr)
        print("resampled min/max/std:", speech.min().item(), speech.max().item(), speech.std().item())

    return speech

import cosyvoice
import cosyvoice.cli.cosyvoice as cosyvoice_cli
import cosyvoice.utils.file_utils as file_utils
import cosyvoice.cli.frontend as frontend

# Rebind `load_wav` on both CosyVoice modules to the soundfile-based loader.
for _patched_module in (file_utils, frontend):
    _patched_module.load_wav = load_wav_no_torchcodec

# Environment report: where the CosyVoice modules were imported from, whether
# the model and prompt paths exist, and which library versions are available.
_banner = "=" * 80
print(_banner)
print("IMPORT CHECK")
print(_banner)

for _label, _module in (
    ("cosyvoice", cosyvoice),
    ("cosyvoice cli", cosyvoice_cli),
    ("frontend", frontend),
    ("file_utils", file_utils),
):
    print(f"{_label}:", _module.__file__)

for _label, _path in (("MODEL_DIR", MODEL_DIR), ("PROMPT_WAV", PROMPT_WAV)):
    print(f"\n{_label}:", _path)
    print(f"{_label} exists:", _path.exists())

print("\nTorch:", torch.__version__)
print("CUDA:", torch.cuda.is_available())

# The two probes below are best-effort: any failure while importing or
# querying the package is reported and the script carries on.
try:
    import torchaudio
    print("torchaudio:", torchaudio.__version__)
except Exception as exc:
    print("torchaudio error:", repr(exc))

try:
    import onnxruntime as ort
    print("onnxruntime:", ort.__version__)
    print("onnxruntime providers:", ort.get_available_providers())
except Exception as exc:
    print("onnxruntime error:", repr(exc))

from cosyvoice.cli.cosyvoice import AutoModel

# NOTE: from this line on, the name `cosyvoice` refers to the model object
# built here, no longer to the `cosyvoice` package imported earlier.
_model_dir = str(MODEL_DIR)
cosyvoice = AutoModel(model_dir=_model_dir)
print("\nsample_rate:", cosyvoice.sample_rate)
print("cosyvoice object:", type(cosyvoice))

# Positional inputs of the zero-shot call. Presumably the first is the text to
# synthesize and the second is the transcript of the prompt recording —
# confirm against the `inference_zero_shot` signature.
_tts_text = "八百标兵奔北坡,北坡炮兵并排跑,炮兵怕把标兵碰,标兵怕碰炮兵炮。"
_prompt_text = "You are a helpful assistant.<|endofprompt|>希望你以后能够做的比我还好呦。"
_prompt_wav = str(PROMPT_WAV)

# Collect every result the call yields before moving on.
_results = cosyvoice.inference_zero_shot(
    _tts_text,
    _prompt_text,
    _prompt_wav,
    stream=False,
    speed=1.0,
)
outs = list(_results)

# Join all generated pieces along dim=1, the axis whose length is divided by
# the sample rate below to obtain the duration.
_pieces = [chunk["tts_speech"] for chunk in outs]
wav = torch.cat(_pieces, dim=1)

print("\nOUT")
print("shape:", wav.shape)
print("sample_rate:", cosyvoice.sample_rate)
print("duration:", wav.shape[1] / cosyvoice.sample_rate)
_lo, _hi, _spread = wav.min().item(), wav.max().item(), wav.std().item()
print("min/max/std:", _lo, _hi, _spread)

# Move to CPU, drop the leading dimension and limit samples to [-1.0, 1.0]
# before writing the file.
audio = np.clip(wav.detach().cpu().squeeze(0).numpy(), -1.0, 1.0)

sf.write(OUTPUT_WAV, audio, cosyvoice.sample_rate)
display(Audio(OUTPUT_WAV))