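# Minimal CPU-only Gradio demo: generate expressive speech with the Maya1
# text-to-speech model and decode its audio tokens with the SNAC 24 kHz codec.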
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from snac import SNAC
import soundfile as sf
import gradio as gr

device = "cpu"

model = AutoModelForCausalLM.from_pretrained(
    "maya-research/maya1",
    dtype=torch.bfloat16,
    device_map=None
)
tokenizer = AutoTokenizer.from_pretrained("maya-research/maya1")
snac_model = SNAC.from_pretrained("hubertsiuzdak/snac_24khz").eval().to(device)

def generate_voice(description, text):
    # Maya1 takes the voice description inline, ahead of the text to speak.
    prompt = f'<description="{description}"> {text}'
    inputs = tokenizer(prompt, return_tensors="pt").to(device)
    with torch.inference_mode():
        outputs = model.generate(
            **inputs,
            max_new_tokens=500,
            temperature=0.4,
            top_p=0.9,
            do_sample=True
        )
    # Drop the prompt tokens, then keep only SNAC audio tokens
    # (IDs 128266-156937: 7 codebook slots x 4096 codes each).
    generated_ids = outputs[0, inputs['input_ids'].shape[1]:]
    snac_tokens = [t.item() for t in generated_ids if 128266 <= t <= 156937]
    frames = len(snac_tokens) // 7
    if frames == 0:
        # Guard against an empty decode, which would crash the SNAC decoder.
        raise gr.Error("No audio tokens were generated; try different inputs.")
    # De-interleave each 7-token frame into SNAC's three hierarchical
    # codebooks: 1 coarse code, 2 medium codes, and 4 fine codes per frame.
    codes = [[], [], []]
    for i in range(frames):
        s = snac_tokens[i * 7:(i + 1) * 7]
        codes[0].append((s[0] - 128266) % 4096)
        codes[1].extend([(s[1] - 128266) % 4096, (s[4] - 128266) % 4096])
        codes[2].extend([
            (s[2] - 128266) % 4096,
            (s[3] - 128266) % 4096,
            (s[5] - 128266) % 4096,
            (s[6] - 128266) % 4096
        ])
    codes_tensor = [
        torch.tensor(c, dtype=torch.long, device=device).unsqueeze(0)
        for c in codes
    ]
    # Dequantize the codes and run the SNAC decoder to get a mono waveform.
    with torch.inference_mode():
        audio = snac_model.decoder(snac_model.quantizer.from_codes(codes_tensor))[0, 0].cpu().numpy()
    out_path = "output.wav"
    sf.write(out_path, audio, 24000)  # SNAC's 24 kHz sample rate
    return out_path

demo = gr.Interface(
    fn=generate_voice,
    inputs=[
        gr.Textbox(label="Voice Description (e.g., calm female voice with British accent)"),
        gr.Textbox(label="Text to Speak (type anything you want)")
    ],
    outputs=gr.Audio(label="Generated Speech"),
    title="๐ŸŽ™๏ธ Maya1 Voice Generator (CPU-only)",
    description="Generate expressive emotional speech using Maya1 + SNAC on CPU."
)

if __name__ == "__main__":
    demo.launch()
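
# Usage sketch (assumptions: the script is saved as app.py and the pip
# packages torch, transformers, snac, soundfile, and gradio are installed):
#   pip install torch transformers snac soundfile gradio
#   python app.py
# Gradio prints a local URL; open it, enter a voice description and some
# text, and the generated 24 kHz WAV plays in the browser.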