Spaces:
Sleeping
Sleeping
| # This application creates a Gradio interface for testing the speed of different tokenizers | |
| import gradio as gr | |
| import tiktoken | |
| import time | |
| from transformers import AutoTokenizer | |
# Model identifiers offered in the "Model Name" dropdown of the UI.
EXAMPLE_MODELS: list = ["gpt2"]

# Hugging Face tokenizers, loaded once at import time and keyed by model name
# so that loading cost is not part of any per-request timing.
TOKENIZERS: dict = {
    name: AutoTokenizer.from_pretrained(name) for name in EXAMPLE_MODELS
}
def get_tokenizer(model_name):
    """Return the tokenizer to use for ``model_name``.

    Names listed in ``EXAMPLE_MODELS`` resolve to the preloaded entry in
    ``TOKENIZERS``; any other name falls back to tiktoken's ``"gpt2"``
    encoding.
    """
    # Guard clause: unknown names get the tiktoken fallback.
    if model_name not in EXAMPLE_MODELS:
        return tiktoken.get_encoding("gpt2")
    return TOKENIZERS[model_name]
def times_faster(time_1: float, time_2: float) -> float:
    """Return ``time_2`` expressed as a percentage of ``time_1``.

    A result of 100.0 means both durations are equal; a result below 100.0
    means ``time_2`` is the shorter duration.

    Args:
        time_1: Baseline duration (the denominator of the ratio).
        time_2: Duration compared against the baseline.

    Returns:
        ``(time_2 / time_1) * 100``. When ``time_1`` is zero the ratio is
        undefined, so 100.0 is returned if ``time_2`` is also zero (the two
        durations are equal) and ``float("inf")`` otherwise.
    """
    # Very fast operations can measure as exactly 0.0 on coarse clocks;
    # avoid a ZeroDivisionError in that case instead of crashing the caller.
    if time_1 == 0:
        return 100.0 if time_2 == 0 else float("inf")
    return (time_2 / time_1) * 100
def run_hf_tokenizer(model_name, text):
    """Encode ``text`` with the tokenizer chosen for ``model_name`` and time it.

    The tokenizer is obtained from ``get_tokenizer`` before the clock starts,
    so only the ``encode`` call itself is measured. The encoded result and the
    elapsed time are also printed to stdout.

    Args:
        model_name: Name passed to ``get_tokenizer`` to select the tokenizer.
        text: The string to encode.

    Returns:
        A ``(elapsed_time, encoded)`` tuple: the duration of the ``encode``
        call in fractional seconds and whatever ``encode`` returned.
    """
    tokenizer = get_tokenizer(model_name)
    # perf_counter is the clock intended for measuring short durations;
    # time.time() is an adjustable wall clock that may have coarse resolution.
    start = time.perf_counter()
    encoded = tokenizer.encode(text)
    elapsed_time = time.perf_counter() - start
    print(f"Encoded: {encoded}")
    print(f"Time taken by HF tokenizer: {elapsed_time}")
    return elapsed_time, encoded
def run_openai_tokenizer(text):
    """Encode ``text`` with tiktoken's ``"gpt2"`` encoding and time it.

    The encoding is obtained before the clock starts, so only the ``encode``
    call itself is measured. The encoded result and the elapsed time are also
    printed to stdout.

    Args:
        text: The string to encode.

    Returns:
        A ``(elapsed_time, encoded)`` tuple: the duration of the ``encode``
        call in fractional seconds and whatever ``encode`` returned.
    """
    tokenizer = tiktoken.get_encoding("gpt2")
    # perf_counter is the clock intended for measuring short durations;
    # time.time() is an adjustable wall clock that may have coarse resolution.
    start = time.perf_counter()
    encoded = tokenizer.encode(text)
    elapsed_time = time.perf_counter() - start
    print(f"Encoded: {encoded}")
    print(f"Time taken by OpenAI tokenizer: {elapsed_time}")
    return elapsed_time, encoded
def run_tokenizers(model_name, text):
    """Run both tokenizers on ``text`` and summarise the results.

    Args:
        model_name: Model name forwarded to ``run_hf_tokenizer``.
        text: The string both tokenizers encode.

    Returns:
        A dict with one sub-dict per tokenizer (elapsed time and token count)
        and a "Times Faster" entry holding the result of
        ``times_faster(hf_time, openai_time)`` followed by ``"%"``.
    """
    hf_seconds, hf_tokens = run_hf_tokenizer(model_name, text)
    openai_seconds, openai_tokens = run_openai_tokenizer(text)

    # NOTE: the two token-count keys differ in casing ("Num tokens" vs
    # "Num Tokens"); they are kept as-is because they are part of the output.
    hf_summary = {"Time Taken": hf_seconds, "Num tokens": len(hf_tokens)}
    openai_summary = {"Time Taken": openai_seconds, "Num Tokens": len(openai_tokens)}
    ratio_text = str(times_faster(hf_seconds, openai_seconds)) + "%"

    return {
        "HF Tokenizer": hf_summary,
        "OpenAI Tokenizer": openai_summary,
        "Times Faster": ratio_text,
    }
# Gradio UI: a model dropdown plus a text box feed run_tokenizers, whose
# result dict is rendered as JSON. Two clickable examples are provided.
iface = gr.Interface(
    fn=run_tokenizers,
    inputs=[
        gr.components.Dropdown(EXAMPLE_MODELS, label="Model Name"),
        gr.components.Textbox(lines=10, label="Text"),
    ],
    outputs="json",
    title="OpenAI Tokenizer vs HF Tokenizers Speed Test",
    examples=[
        ["gpt2", "This is a test of the OpenAI tokenizer vs the HF tokenizer"],
        # Longer multi-line sample. The emoji were previously mis-decoded
        # (UTF-8 bytes read through a single-byte codepage) and are restored.
        ["gpt2", """
State-of-the-art Machine Learning for PyTorch, TensorFlow, and JAX.
🤗 Transformers provides APIs and tools to easily download and train state-of-the-art pretrained models. Using pretrained models can reduce your compute costs, carbon footprint, and save you the time and resources required to train a model from scratch. These models support common tasks in different modalities, such as:
📝 Natural Language Processing: text classification, named entity recognition, question answering, language modeling, summarization, translation, multiple choice, and text generation.
🖼️ Computer Vision: image classification, object detection, and segmentation.
🗣️ Audio: automatic speech recognition and audio classification.
🐙 Multimodal: table question answering, optical character recognition, information extraction from scanned documents, video classification, and visual question answering.
"""],
    ],
)

# Start the web server for the interface when this script runs.
iface.launch()