from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
import torch
import threading

model_name = "microsoft/phi-2"
device = "cuda" if torch.cuda.is_available() else "cpu"

tokenizer = AutoTokenizer.from_pretrained(model_name)
# phi-2's tokenizer ships without a pad token; reuse EOS so generate()
# can pad and build attention masks without warnings.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# fp16 on GPU halves memory; fall back to fp32 on CPU, where fp16 is slow or unsupported.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16 if device == "cuda" else torch.float32,
    low_cpu_mem_usage=True,
).to(device)
# Persona instructions prepended to every prompt in the loop below.
system_prompt = (
    "You are ProTalk, a professional and intelligent AI. "
    "You answer clearly, politely, and with insight. "
    "Be professional, witty, and helpful in all responses."
)
def chat_loop():
    history = []  # alternating "User: ..." / "ProTalk: ..." turns
    print("ProTalk Online - type 'exit' to quit.\n")
    while True:
        user_input = input("User: ")
        if user_input.lower() == "exit":
            break

        # Rebuild the full prompt each turn: persona, prior turns, new user message.
        prompt = system_prompt + "\n" + "\n".join(history) + f"\nUser: {user_input}\nProTalk:"
        inputs = tokenizer(prompt, return_tensors="pt").to(device)

        # TextIteratorStreamer yields decoded text chunks as generate() produces them.
        # A generous timeout avoids queue.Empty errors on slow (e.g. CPU) generation.
        streamer = TextIteratorStreamer(
            tokenizer, timeout=60.0, skip_prompt=True, skip_special_tokens=True
        )

        # Run generation in a background thread so the main thread can
        # consume the streamer and print tokens as they arrive.
        thread = threading.Thread(target=model.generate, kwargs={
            "input_ids": inputs["input_ids"],
            "attention_mask": inputs["attention_mask"],  # silences the missing-mask warning
            "max_new_tokens": 200,
            "do_sample": True,
            "temperature": 0.7,
            "top_p": 0.9,
            "pad_token_id": tokenizer.eos_token_id,
            "streamer": streamer,
        })
        thread.start()

        output_text = ""
        for token in streamer:
            print(token, end="", flush=True)
            output_text += token
        thread.join()
        print()

        history.append(f"User: {user_input}")
        history.append(f"ProTalk: {output_text}")

if __name__ == "__main__":
    chat_loop()
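
# Note: phi-2's context window is 2048 tokens, and `history` above grows without
# bound, so a long chat will eventually overflow it. Below is a minimal sketch of
# one way to handle that; the helper name and the oldest-turns-first dropping
# policy are assumptions, not part of the original script.

def truncate_history(history, max_prompt_tokens=1500):
    # Hypothetical helper: drop the oldest User/ProTalk pair until the joined
    # history fits the token budget, leaving headroom for the reply.
    while history and len(tokenizer("\n".join(history))["input_ids"]) > max_prompt_tokens:
        history = history[2:]  # remove one User/ProTalk pair
    return history

# Usage sketch: inside chat_loop(), call `history = truncate_history(history)`
# just before the prompt is rebuilt each turn.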