-
Notifications
You must be signed in to change notification settings - Fork 2
Expand file tree
/
Copy pathsend_request.py
More file actions
35 lines (28 loc) · 1.14 KB
/
send_request.py
File metadata and controls
35 lines (28 loc) · 1.14 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
import time
import openai
from transformers import AutoTokenizer
MODEL_PATH="Qwen/Qwen2-7B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
port=62726
client = openai.Client(base_url=f"http://127.0.0.1:{port}/v1", api_key="None")
key=71432
n_repeat=5000
prompt = "The grass is green. The sky is blue. The sun is yellow. Here we go. There and back again. " * n_repeat + f"The pass key is {key}. Remember it. {key} is the pass key. " + "The grass is green. The sky is blue. The sun is yellow. Here we go. There and back again. " * n_repeat + "What is the pass key?"
print("prompt length:", len(tokenizer.encode(prompt)))
start_time = time.time()
response = client.chat.completions.create(
model=MODEL_PATH,
messages=[
{"role": "user", "content": prompt},
],
temperature=0,
stream=True,
)
first_token_received=False
for chunk in response:
if chunk.choices[0].delta.content:
if not first_token_received:
print("\n\nTTFT:", time.time() - start_time)
first_token_received=True
print(chunk.choices[0].delta.content, end="", flush=True)
print("\n\nTime used:", time.time() - start_time)