Spaces:
Sleeping
Sleeping
File size: 2,157 Bytes
de3e327 4eb1a13 de3e327 4eb1a13 de3e327 4eb1a13 de3e327 4eb1a13 de3e327 4eb1a13 de3e327 4eb1a13 de3e327 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 |
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from snac import SNAC
import soundfile as sf
import gradio as gr
# Everything in this script runs on one device; the tokenized inputs and both
# models must all live on it.
device = "cpu"

# Hugging Face repo id shared by the language model and its tokenizer.
_MAYA_REPO_ID = "maya-research/maya1"

# Causal LM that emits SNAC audio token ids. It is moved to `device`
# explicitly so it always matches the inputs, which generate_voice also
# places on `device`.
model = AutoModelForCausalLM.from_pretrained(
    _MAYA_REPO_ID,
    dtype=torch.bfloat16,
    device_map=None,
).to(device)
tokenizer = AutoTokenizer.from_pretrained(_MAYA_REPO_ID)

# SNAC codec used to turn the generated codes back into a waveform.
snac_model = SNAC.from_pretrained("hubertsiuzdak/snac_24khz").eval().to(device)
# Layout of the SNAC audio tokens inside the language model's vocabulary.
_SNAC_MIN_ID = 128266          # first SNAC token id; also the id offset
_SNAC_MAX_ID = 156937          # last SNAC token id (inclusive)
_SNAC_TOKENS_PER_FRAME = 7     # tokens consumed per audio frame
_SNAC_CODEBOOK_SIZE = 4096     # ids are reduced modulo this value
_OUTPUT_SAMPLE_RATE = 24000    # sample rate passed to soundfile when writing


def _unpack_snac_frames(snac_tokens):
    """Split a flat list of SNAC token ids into three per-level code lists.

    Tokens are consumed in groups of seven. Within a group, slot 0 goes to
    level 0, slots 1 and 4 go to level 1, and slots 2, 3, 5 and 6 go to
    level 2. Trailing tokens that do not fill a whole group are ignored.

    Args:
        snac_tokens: List of int token ids, each already known to be in the
            SNAC id range.

    Returns:
        A list ``[level0, level1, level2]`` of int lists holding 1, 2 and 4
        codes per frame respectively.
    """
    frame_count = len(snac_tokens) // _SNAC_TOKENS_PER_FRAME
    codes = [[], [], []]
    for frame in range(frame_count):
        start = frame * _SNAC_TOKENS_PER_FRAME
        s = [
            (token - _SNAC_MIN_ID) % _SNAC_CODEBOOK_SIZE
            for token in snac_tokens[start:start + _SNAC_TOKENS_PER_FRAME]
        ]
        codes[0].append(s[0])
        codes[1].extend((s[1], s[4]))
        codes[2].extend((s[2], s[3], s[5], s[6]))
    return codes


def generate_voice(description, text):
    """Generate speech for *text* in the voice given by *description*.

    Builds the prompt, samples token ids from the language model, keeps the
    ids that fall in the SNAC range, decodes them with the SNAC model and
    writes the waveform to a WAV file.

    Args:
        description: Free-text voice description inserted into the prompt.
        text: The text to be spoken.

    Returns:
        Path of the written WAV file (``"output.wav"``).

    Raises:
        gr.Error: If generation yields fewer SNAC tokens than one full
            frame, so there is nothing to decode.
    """
    prompt = f'<description="{description}"> {text}'
    inputs = tokenizer(prompt, return_tensors="pt").to(device)
    with torch.inference_mode():
        outputs = model.generate(
            **inputs,
            max_new_tokens=500,
            temperature=0.4,
            top_p=0.9,
            do_sample=True,
        )

    # Strip the echoed prompt, then keep only ids inside the SNAC range.
    prompt_length = inputs["input_ids"].shape[1]
    generated_ids = outputs[0, prompt_length:]
    in_snac_range = (generated_ids >= _SNAC_MIN_ID) & (generated_ids <= _SNAC_MAX_ID)
    snac_tokens = generated_ids[in_snac_range].tolist()

    codes = _unpack_snac_frames(snac_tokens)
    if not codes[0]:
        # Without this guard, empty code tensors would be handed to the
        # decoder and fail with an error unrelated to the real cause.
        raise gr.Error(
            "The model did not produce any complete audio frame. "
            "Try different text or a different voice description."
        )

    codes_tensor = [
        torch.tensor(level, dtype=torch.long, device=device).unsqueeze(0)
        for level in codes
    ]
    with torch.inference_mode():
        latents = snac_model.quantizer.from_codes(codes_tensor)
        audio = snac_model.decoder(latents)[0, 0].cpu().numpy()

    # NOTE(review): the same filename is overwritten on every call.
    out_path = "output.wav"
    sf.write(out_path, audio, _OUTPUT_SAMPLE_RATE)
    return out_path
# Gradio UI: two text boxes are passed to generate_voice in order
# (description, text); the returned file path is shown in an audio player.
demo = gr.Interface(
    fn=generate_voice,
    inputs=[
        gr.Textbox(label="Voice Description (e.g., calm female voice with British accent)"),
        gr.Textbox(label="Text to Speak (type anything you want)"),
    ],
    outputs=gr.Audio(label="Generated Speech"),
    # Leading emoji is U+1F399 (studio microphone) + U+FE0F, written as
    # escapes so it survives copies through non-UTF-8 tooling.
    title="\U0001F399\uFE0F Maya1 Voice Generator (CPU-only)",
    description="Generate expressive emotional speech using Maya1 + SNAC on CPU.",
)

# Start the web server only when run as a script, not when imported.
if __name__ == "__main__":
    demo.launch()
|