#!/usr/bin/env python3
import torch
from transformers import AutoProcessor, WhisperForConditionalGeneration
from transformers.utils import is_flash_attn_2_available
import soundfile as sf
import numpy as np
import json, subprocess, os

# Report the runtime environment before any heavy work starts.
cuda_ok = torch.cuda.is_available()
print(f"PyTorch: {torch.__version__}, CUDA: {cuda_ok}")
print(f"Device: {torch.cuda.get_device_name(0) if cuda_ok else 'N/A'}")

# --- Generate test audio via the VoxCPM2 TTS HTTP endpoint ----------------
# Sends the French "rapport de chantier" prompt to the local speech service
# and saves the synthesized WAV for the transcription step below.
print("\nGenerating audio with VoxCPM2...")
tts_payload = {
    'model': 'openbmb/VoxCPM2',
    'input': '(voix masculine professionnelle, ton sérieux) Bonjour et bienvenue dans ce rapport de chantier. Nous sommes le 23 avril 2026 et nous intervenons sur le chantier de rénovation du centre-ville de Montpellier. L équipe au complet est présente : trois maçons, deux électriciens et un menuisier. Le chantier a démarré à 7 heures du matin.',
    'voice': 'default',
    'cfg_value': 2.0,
    'inference_timesteps': 10
}
# -sS: suppress the progress meter but keep error output on stderr.
# --fail: make curl exit non-zero on HTTP 4xx/5xx instead of silently
# saving the server's error body to the output file as if it were a WAV.
result = subprocess.run([
    'curl', '-sS', '--fail', '-X', 'POST', 'http://192.168.1.127:8101/v1/audio/speech',
    '-H', 'Content-Type: application/json',
    '-d', json.dumps(tts_payload),
    '-o', '/tmp/rcc_test.wav'
], capture_output=True, text=True)

# Reject failures: non-zero curl exit, missing file, or a file too small to
# be real audio (a tiny file is likely an error page that slipped through).
if result.returncode != 0 or not os.path.exists('/tmp/rcc_test.wav') or os.path.getsize('/tmp/rcc_test.wav') < 1000:
    print(f"✗ Audio gen failed: {result.stderr or result.returncode}")
    # exit() comes from the site module and is not guaranteed in all
    # interpreter setups; SystemExit is the portable equivalent.
    raise SystemExit(1)

size = os.path.getsize('/tmp/rcc_test.wav')
print(f"✓ Audio saved: /tmp/rcc_test.wav ({size/1024:.0f} KB)")

# --- Load the WAV and resample to 16 kHz (Whisper requires 16k) -----------
samples, sr = sf.read('/tmp/rcc_test.wav')
# np.interp below needs a 1-D signal; downmix multi-channel audio to mono.
# (soundfile returns shape (frames, channels) for multi-channel files.)
if samples.ndim > 1:
    samples = samples.mean(axis=1)
print(f"Loaded: {len(samples)} samples at {sr} Hz, {len(samples)/sr:.1f}s")

# Resample to 16000 Hz by linear interpolation.
# Demo quality only — production code would use librosa/resampy/soxr for a
# proper band-limited resample.  NOTE: the original re-read the WAV from
# disk here; `samples` is already in memory, so reuse it.
target_sr = 16000
num_samples = int(len(samples) * target_sr / sr)
samples_16k = np.interp(
    np.linspace(0, len(samples), num_samples),
    np.linspace(0, len(samples), len(samples)),
    samples,
)
print(f"Resampled to: {num_samples} samples at {target_sr} Hz, {num_samples/target_sr:.1f}s")

# --- Load the Whisper large-v3 model for transcription --------------------
print("\nLoading Whisper model...")
whisper_id = "openai/whisper-large-v3"
processor = AutoProcessor.from_pretrained(whisper_id)
# fp16 weights, pinned to the first CUDA device.
model = WhisperForConditionalGeneration.from_pretrained(
    whisper_id,
    dtype=torch.float16,
    device_map="cuda:0",
)
print(f"Model loaded. Flash Attn 2: {is_flash_attn_2_available()}")

# --- Extract features and transcribe --------------------------------------
# Build log-mel input features, then move them to the GPU in fp16 to match
# the model weights.
features = processor(samples_16k, sampling_rate=target_sr, return_tensors="pt")
input_features = features.input_features.to("cuda:0", dtype=torch.float16)

# Decode in French, capping the generated length.
output_ids = model.generate(input_features, language="fr", max_new_tokens=256)
decoded = processor.batch_decode(output_ids, skip_special_tokens=True)
text = decoded[0]

print(f"\n✓ Transcript: {text}")
print(f"Length: {len(text)} chars")
